In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
#Importing the important library
In [4]:
df=pd.read_excel('file:///C:/Users/Ritesh/Desktop/Notes/Datasets%20for%20Python/NBFC Loan Transaction Data.xlsx')
#Loading of the data
In [5]:
df.head()
#First 5 records
Out[5]:
AGREEMENTID AUTHORIZATIONDATE BALANCE_EXCESS BALANCE_TENURE CITY COMPLETED_TENURE CURRENT_INTEREST_RATE CURRENT_INTEREST_RATE_MAX CURRENT_INTEREST_RATE_MIN CURRENT_INTEREST_RATE_CHANGES ... PAID_PRINCIPAL PRE_EMI_DUEAMT PRE_EMI_OS_AMOUNT PRE_EMI_RECEIVED_AMT PRODUCT SCHEMEID NPA_IN_LAST_MONTH NPA_IN_CURRENT_MONTH MOB FORECLOSURE
0 11220001 2010-08-29 0.0 0 MUMBAI 45 13.421934 13.734072 13.421934 1 ... 1.159366e+07 7994.273589 0.0 7994.273589 HL 10901100.0 NaN NaN 45 1
1 11220002 2010-09-15 0.0 99 MUMBAI 38 13.734072 13.734072 13.734072 0 ... 6.942656e+06 100504.575864 0.0 100504.575864 HL 10901100.0 NaN NaN 38 1
2 11220006 2010-11-02 0.0 231 MUMBAI 81 15.606900 16.231176 15.606900 2 ... 9.392115e+05 5947.630536 0.0 5947.630536 HL 10901101.0 NaN NaN 81 1
3 11220008 2010-10-06 0.0 0 THANE 91 11.236968 13.734072 11.236968 4 ... 9.979637e+06 60322.180776 0.0 60322.180776 HL 10901100.0 NaN NaN 91 1
4 11220010 2010-10-26 0.0 215 MUMBAI 89 14.982624 15.606900 14.982624 2 ... 5.552847e+05 27732.787464 0.0 27732.787464 HL 10901101.0 NaN NaN 90 1

5 rows × 53 columns

In [6]:
print(df['AUTHORIZATIONDATE'].min())
print(df['AUTHORIZATIONDATE'].max())
print(df['LAST_RECEIPT_DATE'].min())
print(df['LAST_RECEIPT_DATE'].max())
2010-08-29 00:00:00
2018-12-31 00:00:00
2013-10-10 00:00:00
2019-01-01 00:00:00
In [7]:
#We can see that there are total 53 columns. And looking atthe various columns we can see that there are null values present
#in certain columns. More over looking at datatype of the column , datatype looks good.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20012 entries, 0 to 20011
Data columns (total 53 columns):
AGREEMENTID                            20012 non-null int64
AUTHORIZATIONDATE                      20012 non-null datetime64[ns]
BALANCE_EXCESS                         20012 non-null float64
BALANCE_TENURE                         20012 non-null int64
CITY                                   20012 non-null object
COMPLETED_TENURE                       20012 non-null int64
CURRENT_INTEREST_RATE                  20012 non-null float64
CURRENT_INTEREST_RATE_MAX              20012 non-null float64
CURRENT_INTEREST_RATE_MIN              20012 non-null float64
CURRENT_INTEREST_RATE_CHANGES          20012 non-null int64
CURRENT_TENOR                          20012 non-null int64
CUSTOMERID                             19731 non-null float64
DIFF_AUTH_INT_DATE                     20012 non-null int64
DIFF_CURRENT_INTEREST_RATE_MAX_MIN     20012 non-null float64
DIFF_EMI_AMOUNT_MAX_MIN                19923 non-null float64
DIFF_ORIGINAL_CURRENT_INTEREST_RATE    20012 non-null float64
DIFF_ORIGINAL_CURRENT_TENOR            20012 non-null int64
DPD                                    20012 non-null int64
DUEDAY                                 20012 non-null int64
EMI_AMOUNT                             20012 non-null float64
EMI_DUEAMT                             20012 non-null float64
EMI_OS_AMOUNT                          20012 non-null float64
EMI_RECEIVED_AMT                       20012 non-null float64
EXCESS_ADJUSTED_AMT                    20012 non-null float64
EXCESS_AVAILABLE                       20012 non-null float64
FOIR                                   20012 non-null float64
INTEREST_START_DATE                    20012 non-null datetime64[ns]
LAST_RECEIPT_AMOUNT                    19765 non-null float64
LAST_RECEIPT_DATE                      19937 non-null datetime64[ns]
LATEST_TRANSACTION_MONTH               19937 non-null float64
LOAN_AMT                               20012 non-null float64
MAX_EMI_AMOUNT                         19923 non-null float64
MIN_EMI_AMOUNT                         19923 non-null float64
MONTHOPENING                           20012 non-null float64
NET_DISBURSED_AMT                      20012 non-null float64
NET_LTV                                20012 non-null float64
NET_RECEIVABLE                         20012 non-null float64
NUM_EMI_CHANGES                        20012 non-null int64
NUM_LOW_FREQ_TRANSACTIONS              20012 non-null int64
ORIGNAL_INTEREST_RATE                  20012 non-null float64
ORIGNAL_TENOR                          20012 non-null int64
OUTSTANDING_PRINCIPAL                  20012 non-null float64
PAID_INTEREST                          20012 non-null float64
PAID_PRINCIPAL                         20012 non-null float64
PRE_EMI_DUEAMT                         20012 non-null float64
PRE_EMI_OS_AMOUNT                      20012 non-null float64
PRE_EMI_RECEIVED_AMT                   20012 non-null float64
PRODUCT                                20012 non-null object
SCHEMEID                               19731 non-null float64
NPA_IN_LAST_MONTH                      119 non-null object
NPA_IN_CURRENT_MONTH                   119 non-null object
MOB                                    20012 non-null int64
FORECLOSURE                            20012 non-null int64
dtypes: datetime64[ns](3), float64(32), int64(14), object(4)
memory usage: 8.1+ MB
In [8]:
df.shape
Out[8]:
(20012, 53)
In [9]:
pd.options.display.max_columns = None
#To display all the columns.
In [11]:
df.describe().T
Out[11]:
count mean std min 25% 50% 75% max
AGREEMENTID 20012.0 1.123665e+07 9.626440e+03 1.122000e+07 1.122833e+07 1.123661e+07 1.124496e+07 1.125335e+07
BALANCE_EXCESS 20012.0 7.899598e+04 1.348636e+06 0.000000e+00 0.000000e+00 0.000000e+00 5.742235e+01 7.555600e+07
BALANCE_TENURE 20012.0 1.728246e+02 6.400448e+01 0.000000e+00 1.360000e+02 1.740000e+02 2.160000e+02 6.740000e+02
COMPLETED_TENURE 20012.0 1.726909e+01 1.648628e+01 0.000000e+00 6.000000e+00 1.200000e+01 2.500000e+01 9.800000e+01
CURRENT_INTEREST_RATE 20012.0 1.478193e+01 2.485858e+00 9.901017e+00 1.279766e+01 1.454563e+01 1.623118e+01 2.509590e+01
CURRENT_INTEREST_RATE_MAX 20012.0 1.490025e+01 2.480029e+00 1.042541e+01 1.310980e+01 1.467049e+01 1.654331e+01 3.745656e+01
CURRENT_INTEREST_RATE_MIN 20012.0 1.430187e+01 2.677014e+00 -5.056636e+00 1.242309e+01 1.373407e+01 1.616875e+01 2.403463e+01
CURRENT_INTEREST_RATE_CHANGES 20012.0 7.580951e-01 1.134323e+00 0.000000e+00 0.000000e+00 0.000000e+00 2.000000e+00 9.000000e+00
CURRENT_TENOR 20012.0 1.900937e+02 5.855995e+01 6.000000e+00 1.660000e+02 1.800000e+02 2.280000e+02 7.130000e+02
CUSTOMERID 19731.0 1.201741e+07 9.490084e+03 1.200100e+07 1.200921e+07 1.201740e+07 1.202562e+07 1.203390e+07
DIFF_AUTH_INT_DATE 20012.0 6.296222e-03 5.696331e-01 -1.700000e+01 0.000000e+00 0.000000e+00 0.000000e+00 7.000000e+01
DIFF_CURRENT_INTEREST_RATE_MAX_MIN 20012.0 5.983747e-01 9.669352e-01 0.000000e+00 0.000000e+00 0.000000e+00 1.186124e+00 2.434676e+01
DIFF_EMI_AMOUNT_MAX_MIN 19923.0 1.152094e+05 9.670824e+05 0.000000e+00 1.020700e+04 1.988500e+04 4.246649e+04 8.496825e+07
DIFF_ORIGINAL_CURRENT_INTEREST_RATE 20012.0 -3.805037e-01 8.811203e-01 -7.179174e+00 -1.186124e+00 0.000000e+00 0.000000e+00 1.032446e+01
DIFF_ORIGINAL_CURRENT_TENOR 20012.0 -6.796372e+00 3.352576e+01 -4.610000e+02 -1.400000e+01 0.000000e+00 0.000000e+00 2.340000e+02
DPD 20012.0 7.574056e+00 6.609890e+01 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 2.054000e+03
DUEDAY 20012.0 5.776634e+00 2.719009e+00 1.000000e+00 5.000000e+00 5.000000e+00 5.000000e+00 1.500000e+01
EMI_AMOUNT 20012.0 4.360950e+04 1.131318e+05 0.000000e+00 1.068500e+04 1.893750e+04 3.642400e+04 4.879479e+06
EMI_DUEAMT 20012.0 1.991553e+06 6.838394e+06 0.000000e+00 2.040216e+05 5.450651e+05 1.481417e+06 3.546104e+08
EMI_OS_AMOUNT 20012.0 3.329735e+04 6.561311e+05 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 5.899531e+07
EMI_RECEIVED_AMT 20012.0 1.958256e+06 6.762984e+06 0.000000e+00 2.020936e+05 5.376576e+05 1.456414e+06 3.546104e+08
EXCESS_ADJUSTED_AMT 20012.0 3.599002e+05 3.923346e+06 0.000000e+00 0.000000e+00 0.000000e+00 2.606091e+02 2.841642e+08
EXCESS_AVAILABLE 20012.0 4.388962e+05 4.169759e+06 0.000000e+00 0.000000e+00 2.606091e+02 3.105009e+03 2.841642e+08
FOIR 20012.0 2.796003e+01 3.871065e+03 -1.703300e+02 4.100000e-01 5.200000e-01 6.800000e-01 5.476160e+05
LAST_RECEIPT_AMOUNT 19765.0 8.067446e+04 8.084027e+05 1.000000e+00 1.106100e+04 1.964200e+04 3.821900e+04 8.496881e+07
LATEST_TRANSACTION_MONTH 19937.0 1.069223e+01 2.821409e+00 1.000000e+00 1.200000e+01 1.200000e+01 1.200000e+01 1.200000e+01
LOAN_AMT 20012.0 5.897355e+06 1.298566e+07 3.753239e+04 1.558947e+06 2.684572e+06 5.233436e+06 4.245665e+08
MAX_EMI_AMOUNT 19923.0 1.222544e+05 9.704516e+05 1.334000e+01 1.331800e+04 2.360000e+04 4.936050e+04 8.496881e+07
MIN_EMI_AMOUNT 19923.0 7.045026e+03 4.342549e+04 1.000000e-02 1.180000e+02 1.331800e+02 3.334000e+03 3.156965e+06
MONTHOPENING 20012.0 5.447511e+06 1.183851e+07 0.000000e+00 1.483752e+06 2.503694e+06 4.791778e+06 3.818367e+08
NET_DISBURSED_AMT 20012.0 5.847666e+06 1.291193e+07 3.753239e+04 1.544083e+06 2.640779e+06 5.186725e+06 4.245665e+08
NET_LTV 20012.0 5.118924e+01 2.110683e+01 3.800000e-01 3.516000e+01 5.330000e+01 6.677000e+01 1.000000e+02
NET_RECEIVABLE 20012.0 -4.543915e+04 1.348502e+06 -7.534554e+07 -1.766842e+01 0.000000e+00 0.000000e+00 3.864350e+07
NUM_EMI_CHANGES 20012.0 2.949830e+00 2.635500e+00 -1.000000e+00 2.000000e+00 2.000000e+00 4.000000e+00 3.300000e+01
NUM_LOW_FREQ_TRANSACTIONS 20012.0 2.769139e+00 2.571271e+00 0.000000e+00 1.000000e+00 2.000000e+00 3.000000e+00 3.000000e+01
ORIGNAL_INTEREST_RATE 20012.0 1.440143e+01 2.603265e+00 9.651307e+00 1.248552e+01 1.373407e+01 1.616875e+01 2.778028e+01
ORIGNAL_TENOR 20012.0 1.832973e+02 4.460026e+01 1.400000e+01 1.800000e+02 1.800000e+02 2.280000e+02 3.000000e+02
OUTSTANDING_PRINCIPAL 20012.0 5.212982e+06 1.152135e+07 -7.506479e-01 1.428919e+06 2.394655e+06 4.551204e+06 3.818367e+08
PAID_INTEREST 20012.0 9.890547e+05 3.026053e+06 0.000000e+00 1.253319e+05 3.097248e+05 7.954680e+05 1.230362e+08
PAID_PRINCIPAL 20012.0 8.667637e+05 3.469758e+07 0.000000e+00 2.341834e+04 7.878650e+04 2.917810e+05 4.885217e+09
PRE_EMI_DUEAMT 20012.0 5.780447e+04 3.776647e+05 0.000000e+00 4.768264e+03 1.069602e+04 3.187879e+04 3.177540e+07
PRE_EMI_OS_AMOUNT 20012.0 2.594779e+02 1.096744e+04 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 1.074264e+06
PRE_EMI_RECEIVED_AMT 20012.0 5.754499e+04 3.769718e+05 0.000000e+00 4.755012e+03 1.067945e+04 3.180536e+04 3.177540e+07
SCHEMEID 19731.0 1.090122e+07 8.890519e+01 1.090110e+07 1.090111e+07 1.090126e+07 1.090129e+07 1.090146e+07
MOB 20012.0 1.881361e+01 1.654188e+01 0.000000e+00 7.000000e+00 1.300000e+01 2.600000e+01 9.800000e+01
FORECLOSURE 20012.0 8.969618e-02 2.857531e-01 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 1.000000e+00
In [12]:
df[df['LOAN_AMT'] - df['NET_DISBURSED_AMT'] > 0].head(10)
Out[12]:
AGREEMENTID AUTHORIZATIONDATE BALANCE_EXCESS BALANCE_TENURE CITY COMPLETED_TENURE CURRENT_INTEREST_RATE CURRENT_INTEREST_RATE_MAX CURRENT_INTEREST_RATE_MIN CURRENT_INTEREST_RATE_CHANGES CURRENT_TENOR CUSTOMERID DIFF_AUTH_INT_DATE DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_EMI_AMOUNT_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DIFF_ORIGINAL_CURRENT_TENOR DPD DUEDAY EMI_AMOUNT EMI_DUEAMT EMI_OS_AMOUNT EMI_RECEIVED_AMT EXCESS_ADJUSTED_AMT EXCESS_AVAILABLE FOIR INTEREST_START_DATE LAST_RECEIPT_AMOUNT LAST_RECEIPT_DATE LATEST_TRANSACTION_MONTH LOAN_AMT MAX_EMI_AMOUNT MIN_EMI_AMOUNT MONTHOPENING NET_DISBURSED_AMT NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_INTEREST_RATE ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PAID_INTEREST PAID_PRINCIPAL PRE_EMI_DUEAMT PRE_EMI_OS_AMOUNT PRE_EMI_RECEIVED_AMT PRODUCT SCHEMEID NPA_IN_LAST_MONTH NPA_IN_CURRENT_MONTH MOB FORECLOSURE
134 11220225 2011-08-19 1964.462833 246 DELHI 48 12.797658 13.734072 12.797658 2 294 12001224.0 0 0.936414 43197.00 0.000000 6 0 1 59111.0 5.738306e+06 130549.717272 5.607756e+06 7.880180e+05 7.899824e+05 0.60 2011-08-19 59111.0 2016-07-26 7.0 1.015476e+07 100000.0 56803.00 9.245573e+06 9.588536e+06 80.00 128585.254439 3 4 12.797658 300 8.795184e+06 3.655802e+06 7.933518e+05 2.748150e+05 0.0 2.748150e+05 HL 10901160.0 NaN NaN 59 0
1034 11221690 2013-01-31 0.000000 192 MUMBAI 0 14.046210 14.046210 14.046210 0 192 12002689.0 0 0.000000 152400.62 -0.936414 0 0 5 137768.0 0.000000e+00 0.000000 0.000000e+00 2.865994e+05 2.865994e+05 0.57 2013-01-31 152768.0 2015-05-05 5.0 2.552203e+07 152768.0 367.38 2.446399e+07 2.446399e+07 78.61 0.000000 5 2 13.109796 192 2.446399e+07 4.888089e+06 0.000000e+00 7.190853e+06 0.0 7.190853e+06 HL 10901148.0 NaN NaN 28 0
1257 11222072 2013-05-09 0.000000 156 MUMBAI 24 16.543314 16.543314 16.543314 0 180 12003071.0 0 0.000000 47983.52 -0.936414 0 0 5 48144.0 2.551885e+06 0.000000 2.551885e+06 1.063285e+05 1.063285e+05 0.76 2013-05-09 48144.0 2015-06-05 6.0 5.909252e+06 48144.0 160.48 5.639402e+06 5.864213e+06 25.47 0.000000 1 1 15.606900 180 5.625757e+06 1.564847e+06 2.384562e+05 8.087717e+04 0.0 8.087717e+04 LAP 10901121.0 NaN NaN 25 0
1298 11222131 2013-05-30 0.000000 194 THANE 13 17.791866 17.791866 17.791866 0 207 12003130.0 0 0.000000 0.00 -0.936414 -27 0 5 30235.0 8.680824e+05 0.000000 8.680824e+05 6.677557e+04 6.677557e+04 0.81 2013-05-30 30235.0 2014-07-05 7.0 3.541129e+06 30235.0 30235.00 3.432420e+06 3.496090e+06 66.63 0.000000 0 1 16.855452 180 3.432420e+06 5.342887e+05 6.366995e+04 1.157281e+04 0.0 1.157281e+04 LAP 10901121.0 NaN NaN 14 0
1301 11222135 2013-06-24 0.000000 243 FARIDABAD 36 13.609217 13.921355 13.609217 1 279 12003134.0 0 0.312138 139200.00 -0.624276 -39 56 5 138897.0 1.104340e+07 607071.313896 1.043633e+07 3.123802e+05 3.123802e+05 2.07 2013-06-24 140000.0 2016-07-25 7.0 2.251643e+07 140000.0 800.00 2.043646e+07 2.102766e+07 57.33 607071.313896 8 8 12.984941 240 2.043646e+07 6.569879e+06 5.911953e+05 9.830265e+04 0.0 9.830265e+04 HL 10901178.0 NaN NaN 37 0
1303 11222137 2013-05-15 0.000000 49 MUMBAI 13 16.855452 16.855452 16.855452 0 62 12003136.0 0 0.000000 0.00 -0.936414 -2 0 5 307616.0 8.832017e+06 0.000000 8.832017e+06 6.793859e+05 6.793859e+05 0.82 2013-05-15 307616.0 2014-07-05 7.0 2.045677e+07 307616.0 307616.00 1.712632e+07 2.041173e+07 42.15 0.000000 0 2 15.919038 60 1.712632e+07 2.870095e+06 3.285412e+06 2.233310e+05 0.0 2.233310e+05 LAP 10901115.0 NaN NaN 14 0
1311 11222151 2013-05-27 0.441710 194 MUMBAI 11 16.855452 16.855452 16.855452 0 205 12003150.0 0 0.000000 3142542.00 -0.936414 -25 0 5 61458.0 1.493065e+06 0.000000 1.493065e+06 7.076201e+06 7.076201e+06 0.58 2013-05-27 61458.0 2014-05-05 5.0 7.433129e+06 3204000.0 61458.00 7.178688e+06 7.388090e+06 15.89 -0.441710 1 2 15.919038 180 2.368536e+06 8.290816e+05 5.019554e+06 3.464556e+04 0.0 3.464556e+04 LAP 10901153.0 NaN NaN 12 0
1400 11222298 2013-06-30 0.000000 180 DELHI 0 13.734072 13.734072 13.734072 0 180 12003297.0 0 0.000000 45365.06 -0.936414 0 0 5 45427.0 0.000000e+00 0.000000 0.000000e+00 1.003279e+05 1.003279e+05 0.35 2013-06-30 45427.0 2014-06-05 6.0 1.426231e+07 45427.0 61.94 7.439900e+06 7.439900e+06 28.21 0.000000 1 1 12.797658 180 7.439900e+06 7.514856e+05 0.000000e+00 1.105508e+06 0.0 1.105508e+06 HL 10901148.0 NaN NaN 12 0
1479 11222448 2013-07-24 0.000000 80 TIRUCHIRAPPALLI 34 18.104004 18.104004 18.104004 0 114 12003447.0 0 0.000000 67986.00 0.000000 66 0 15 68276.0 5.126897e+06 0.000000 5.126897e+06 2.258183e+06 2.258183e+06 0.57 2013-07-24 290.0 2016-06-29 6.0 7.588478e+06 68276.0 290.00 6.716261e+06 7.506479e+06 24.54 0.000000 1 2 18.104004 180 5.132966e+06 2.679384e+06 2.373513e+06 9.785210e+04 0.0 9.785210e+04 STHL 10901112.0 NaN NaN 35 1
1525 11222522 2013-08-27 0.000000 240 GHAZIABAD 0 14.670486 14.670486 14.670486 0 240 12003521.0 0 0.000000 1959.00 -0.936414 0 0 5 13602.0 0.000000e+00 0.000000 0.000000e+00 3.436728e+04 3.436728e+04 0.50 2013-08-27 15561.0 2014-03-05 3.0 4.526407e+06 15561.0 13602.00 2.385685e+06 2.385685e+06 75.97 0.000000 2 1 13.734072 240 2.385685e+06 1.354154e+05 0.000000e+00 1.992092e+05 0.0 1.992092e+05 HL 10901148.0 NaN NaN 7 0
Establishing relationship between different columns . Below are the columns which are derived from the other columns. This information
will help us in performing feature engineering.

PRE_EMI_DUEAMT= PRE_EMI_RECEIVED_AMT+PRE_EMI_OS_AMOUNT

OUTSTANDING_PRINCIPAL+ PAID_PRINCIPAL = LOAN_AMT

BALANCE_TENURE + COMPLETED_TENURE = CURRENT_TENOR

ORIGNAL_TENOR - CURRENT_TENOR = DIFF_ORIGINAL_CURRENT_TENOR

EMI_OS_AMOUNT - BALANCE_EXCESS = NET_RECEIVABLE

PRE_EMI_DUEAMT - PRE_EMI_OS_AMOUNT = PRE_EMI_RECEIVED_AMT

INTEREST_START_DATE - AUTHORIZATIONDATE = DIFF_AUTH_INT_DATE

CURRENT_INTEREST_RATE_MAX - CURRENT_INTEREST_RATE_MIN = DIFF_CURRENT_INTEREST_RATE_MAX_MIN

MAX_EMI_AMOUNT - MIN_EMI_AMOUNT = DIFF_EMI_AMOUNT_MAX_MIN

ORIGNAL_INTEREST_RATE - CURRENT_INTEREST_RATE = DIFF_ORIGINAL_CURRENT_INTEREST_RATE

BALANCE_TENURE,COMPLETED_TENURE,CURRENT_TENOR,DIFF_ORIGINAL_CURRENT_TENOR,ORIGNAL_TENOR

CHECKING THE DIFFERENT RELATIONSHIP BETWEEN THESE COLUMNS

In [13]:
df[['BALANCE_TENURE','COMPLETED_TENURE','CURRENT_TENOR','DIFF_ORIGINAL_CURRENT_TENOR','ORIGNAL_TENOR']].head()
Out[13]:
BALANCE_TENURE COMPLETED_TENURE CURRENT_TENOR DIFF_ORIGINAL_CURRENT_TENOR ORIGNAL_TENOR
0 0 45 45 75 120
1 99 38 137 43 180
2 231 81 312 -132 180
3 0 91 91 89 180
4 215 89 304 -124 180
In [14]:
df[['EMI_OS_AMOUNT','BALANCE_EXCESS','NET_RECEIVABLE']].head()
Out[14]:
EMI_OS_AMOUNT BALANCE_EXCESS NET_RECEIVABLE
0 175477.782641 0.0 175477.782641
1 279448.084560 0.0 279448.084560
2 0.000000 0.0 0.000000
3 63659.280762 0.0 63659.280762
4 0.000000 0.0 0.000000
In [15]:
df[['EMI_DUEAMT','EMI_OS_AMOUNT','EMI_RECEIVED_AMT']].head()
Out[15]:
EMI_DUEAMT EMI_OS_AMOUNT EMI_RECEIVED_AMT
0 8.614898e+06 175477.782641 8.439420e+06
1 1.061903e+07 279448.084560 1.033958e+07
2 4.670211e+06 0.000000 4.670211e+06
3 1.313098e+07 63659.280762 1.306732e+07
4 1.048923e+07 0.000000 1.048923e+07
In [16]:
df[['DIFF_AUTH_INT_DATE','AUTHORIZATIONDATE','INTEREST_START_DATE']].head()
Out[16]:
DIFF_AUTH_INT_DATE AUTHORIZATIONDATE INTEREST_START_DATE
0 1 2010-08-29 2010-08-30
1 0 2010-09-15 2010-09-15
2 -1 2010-11-02 2010-11-01
3 0 2010-10-06 2010-10-06
4 0 2010-10-26 2010-10-26
In [17]:
df[['MAX_EMI_AMOUNT','MIN_EMI_AMOUNT','DIFF_EMI_AMOUNT_MAX_MIN']].head()
Out[17]:
MAX_EMI_AMOUNT MIN_EMI_AMOUNT DIFF_EMI_AMOUNT_MAX_MIN
0 1000000.0 83559.0 916441.0
1 126530.0 126530.0 0.0
2 500000.0 22878.0 477122.0
3 1500000.0 65741.0 1434259.0
4 54433.0 54433.0 0.0
In [18]:
df[['CURRENT_INTEREST_RATE_MAX','CURRENT_INTEREST_RATE_MIN','DIFF_CURRENT_INTEREST_RATE_MAX_MIN']].head()
Out[18]:
CURRENT_INTEREST_RATE_MAX CURRENT_INTEREST_RATE_MIN DIFF_CURRENT_INTEREST_RATE_MAX_MIN
0 13.734072 13.421934 0.312138
1 13.734072 13.734072 0.000000
2 16.231176 15.606900 0.624276
3 13.734072 11.236968 2.497104
4 15.606900 14.982624 0.624276
In [19]:
df[['ORIGNAL_INTEREST_RATE','CURRENT_INTEREST_RATE','DIFF_ORIGINAL_CURRENT_INTEREST_RATE']].head()
Out[19]:
ORIGNAL_INTEREST_RATE CURRENT_INTEREST_RATE DIFF_ORIGINAL_CURRENT_INTEREST_RATE
0 10.612692 13.421934 -2.809242
1 10.612692 13.734072 -3.121380
2 11.549106 15.606900 -4.057794
3 10.612692 11.236968 -0.624276
4 10.924830 14.982624 -4.057794
In [20]:
df[['BALANCE_EXCESS','EXCESS_AVAILABLE','EXCESS_ADJUSTED_AMT']].head(10)
Out[20]:
BALANCE_EXCESS EXCESS_AVAILABLE EXCESS_ADJUSTED_AMT
0 0.000000 2.135869e+05 2.135869e+05
1 0.000000 0.000000e+00 0.000000e+00
2 0.000000 1.179716e+06 1.179716e+06
3 0.000000 6.770848e+06 6.770848e+06
4 0.000000 1.202181e+05 1.202181e+05
5 0.000000 2.765549e+04 2.765549e+04
6 0.000000 9.553313e+04 9.553313e+04
7 0.000000 4.643083e+05 4.643083e+05
8 0.000000 4.529950e+06 4.529950e+06
9 9988.419361 2.674528e+04 1.675686e+04
In [21]:
(df['COMPLETED_TENURE'] + df['DIFF_ORIGINAL_CURRENT_TENOR']).head()
Out[21]:
0    120
1     81
2    -51
3    180
4    -35
dtype: int64
In [23]:
df[df['PRE_EMI_DUEAMT'] - df['PRE_EMI_RECEIVED_AMT'] > 0][['PRE_EMI_DUEAMT','PRE_EMI_OS_AMOUNT','PRE_EMI_RECEIVED_AMT']].head()
Out[23]:
PRE_EMI_DUEAMT PRE_EMI_OS_AMOUNT PRE_EMI_RECEIVED_AMT
1970 579486.499416 108939.897287 470546.602129
2976 35339.040552 23558.624184 11780.416368
3037 522050.896104 118056.004865 403994.891239
3153 612784.837920 340165.596144 272619.241776
3734 324657.144000 47976.375096 276680.768904
In [24]:
df.shape
Out[24]:
(20012, 53)

CHECKING AND DROPPING THE NULL VALUES

In [25]:
df.isnull().sum()
#Null values are present.NPA_IN_LAST_MONTH & NPA_IN_CURRENT_MONTH  are two columns having excess null values. And it would 
#be better to drop them off.
Out[25]:
AGREEMENTID                                0
AUTHORIZATIONDATE                          0
BALANCE_EXCESS                             0
BALANCE_TENURE                             0
CITY                                       0
COMPLETED_TENURE                           0
CURRENT_INTEREST_RATE                      0
CURRENT_INTEREST_RATE_MAX                  0
CURRENT_INTEREST_RATE_MIN                  0
CURRENT_INTEREST_RATE_CHANGES              0
CURRENT_TENOR                              0
CUSTOMERID                               281
DIFF_AUTH_INT_DATE                         0
DIFF_CURRENT_INTEREST_RATE_MAX_MIN         0
DIFF_EMI_AMOUNT_MAX_MIN                   89
DIFF_ORIGINAL_CURRENT_INTEREST_RATE        0
DIFF_ORIGINAL_CURRENT_TENOR                0
DPD                                        0
DUEDAY                                     0
EMI_AMOUNT                                 0
EMI_DUEAMT                                 0
EMI_OS_AMOUNT                              0
EMI_RECEIVED_AMT                           0
EXCESS_ADJUSTED_AMT                        0
EXCESS_AVAILABLE                           0
FOIR                                       0
INTEREST_START_DATE                        0
LAST_RECEIPT_AMOUNT                      247
LAST_RECEIPT_DATE                         75
LATEST_TRANSACTION_MONTH                  75
LOAN_AMT                                   0
MAX_EMI_AMOUNT                            89
MIN_EMI_AMOUNT                            89
MONTHOPENING                               0
NET_DISBURSED_AMT                          0
NET_LTV                                    0
NET_RECEIVABLE                             0
NUM_EMI_CHANGES                            0
NUM_LOW_FREQ_TRANSACTIONS                  0
ORIGNAL_INTEREST_RATE                      0
ORIGNAL_TENOR                              0
OUTSTANDING_PRINCIPAL                      0
PAID_INTEREST                              0
PAID_PRINCIPAL                             0
PRE_EMI_DUEAMT                             0
PRE_EMI_OS_AMOUNT                          0
PRE_EMI_RECEIVED_AMT                       0
PRODUCT                                    0
SCHEMEID                                 281
NPA_IN_LAST_MONTH                      19893
NPA_IN_CURRENT_MONTH                   19893
MOB                                        0
FORECLOSURE                                0
dtype: int64
In [26]:
df.duplicated().sum()
#Theres no duplicate records
Out[26]:
0
In [27]:
df.drop(['NPA_IN_LAST_MONTH','NPA_IN_CURRENT_MONTH'],axis=1,inplace=True)
#Dropping these column as it has lot of null values
In [28]:
df.dropna(inplace=True)
#Dropping the rows having null values in it
In [29]:
df.isnull().sum()
#NO NULL VALUES ARE PRESENT
Out[29]:
AGREEMENTID                            0
AUTHORIZATIONDATE                      0
BALANCE_EXCESS                         0
BALANCE_TENURE                         0
CITY                                   0
COMPLETED_TENURE                       0
CURRENT_INTEREST_RATE                  0
CURRENT_INTEREST_RATE_MAX              0
CURRENT_INTEREST_RATE_MIN              0
CURRENT_INTEREST_RATE_CHANGES          0
CURRENT_TENOR                          0
CUSTOMERID                             0
DIFF_AUTH_INT_DATE                     0
DIFF_CURRENT_INTEREST_RATE_MAX_MIN     0
DIFF_EMI_AMOUNT_MAX_MIN                0
DIFF_ORIGINAL_CURRENT_INTEREST_RATE    0
DIFF_ORIGINAL_CURRENT_TENOR            0
DPD                                    0
DUEDAY                                 0
EMI_AMOUNT                             0
EMI_DUEAMT                             0
EMI_OS_AMOUNT                          0
EMI_RECEIVED_AMT                       0
EXCESS_ADJUSTED_AMT                    0
EXCESS_AVAILABLE                       0
FOIR                                   0
INTEREST_START_DATE                    0
LAST_RECEIPT_AMOUNT                    0
LAST_RECEIPT_DATE                      0
LATEST_TRANSACTION_MONTH               0
LOAN_AMT                               0
MAX_EMI_AMOUNT                         0
MIN_EMI_AMOUNT                         0
MONTHOPENING                           0
NET_DISBURSED_AMT                      0
NET_LTV                                0
NET_RECEIVABLE                         0
NUM_EMI_CHANGES                        0
NUM_LOW_FREQ_TRANSACTIONS              0
ORIGNAL_INTEREST_RATE                  0
ORIGNAL_TENOR                          0
OUTSTANDING_PRINCIPAL                  0
PAID_INTEREST                          0
PAID_PRINCIPAL                         0
PRE_EMI_DUEAMT                         0
PRE_EMI_OS_AMOUNT                      0
PRE_EMI_RECEIVED_AMT                   0
PRODUCT                                0
SCHEMEID                               0
MOB                                    0
FORECLOSURE                            0
dtype: int64
In [30]:
df['AGREEMENTID'].nunique()
#We can drop this unique identifier column as this will not be used for our analysis.
Out[30]:
19492
In [31]:
df.shape
Out[31]:
(19492, 51)
In [32]:
df.drop(['AGREEMENTID'],axis=1,inplace=True)
# Dropping of unique column.
In [33]:
df.head()
Out[33]:
AUTHORIZATIONDATE BALANCE_EXCESS BALANCE_TENURE CITY COMPLETED_TENURE CURRENT_INTEREST_RATE CURRENT_INTEREST_RATE_MAX CURRENT_INTEREST_RATE_MIN CURRENT_INTEREST_RATE_CHANGES CURRENT_TENOR CUSTOMERID DIFF_AUTH_INT_DATE DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_EMI_AMOUNT_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DIFF_ORIGINAL_CURRENT_TENOR DPD DUEDAY EMI_AMOUNT EMI_DUEAMT EMI_OS_AMOUNT EMI_RECEIVED_AMT EXCESS_ADJUSTED_AMT EXCESS_AVAILABLE FOIR INTEREST_START_DATE LAST_RECEIPT_AMOUNT LAST_RECEIPT_DATE LATEST_TRANSACTION_MONTH LOAN_AMT MAX_EMI_AMOUNT MIN_EMI_AMOUNT MONTHOPENING NET_DISBURSED_AMT NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_INTEREST_RATE ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PAID_INTEREST PAID_PRINCIPAL PRE_EMI_DUEAMT PRE_EMI_OS_AMOUNT PRE_EMI_RECEIVED_AMT PRODUCT SCHEMEID MOB FORECLOSURE
0 2010-08-29 0.0 0 MUMBAI 45 13.421934 13.734072 13.421934 1 45 12001000.0 1 0.312138 916441.0 -2.809242 75 0 1 83559.0 8.614898e+06 175477.782641 8.439420e+06 2.135869e+05 2.135869e+05 0.60 2010-08-30 1000000.0 2014-05-05 5.0 1.171011e+07 1000000.0 83559.0 8.693575e+06 1.171011e+07 40.06 175477.782641 1 1 10.612692 120 1.164472e+05 2.725723e+06 1.159366e+07 7994.273589 0.0 7994.273589 HL 10901100.0 45 1
1 2010-09-15 0.0 99 MUMBAI 38 13.734072 13.734072 13.734072 0 137 12001001.0 0 0.000000 0.0 -3.121380 43 0 1 126530.0 1.061903e+07 279448.084560 1.033958e+07 0.000000e+00 0.000000e+00 0.60 2010-09-15 126530.0 2013-11-01 11.0 1.929025e+07 126530.0 126530.0 1.713861e+07 1.929025e+07 84.31 279448.084560 0 0 10.612692 180 1.234760e+07 4.945154e+06 6.942656e+06 100504.575864 0.0 100504.575864 HL 10901100.0 38 1
2 2010-11-02 0.0 231 MUMBAI 81 15.606900 16.231176 15.606900 2 312 12001005.0 -1 0.624276 477122.0 -4.057794 -132 0 5 22878.0 4.670211e+06 0.000000 4.670211e+06 1.179716e+06 1.179716e+06 0.72 2010-11-01 22878.0 2017-08-05 8.0 3.933395e+06 500000.0 22878.0 3.744831e+06 3.933395e+06 50.89 0.000000 2 3 11.549106 180 2.994184e+06 2.990124e+06 9.392115e+05 5947.630536 0.0 5947.630536 HL 10901101.0 81 1
3 2010-10-06 0.0 0 THANE 91 11.236968 13.734072 11.236968 4 91 12001007.0 0 2.497104 1434259.0 -0.624276 89 0 1 65741.0 1.313098e+07 63659.280762 1.306732e+07 6.770848e+06 6.770848e+06 0.60 2010-10-06 65741.0 2018-05-02 5.0 1.002259e+07 1500000.0 65741.0 5.747875e+06 1.002259e+07 84.63 63659.280762 3 8 10.612692 180 4.295120e+04 4.648994e+06 9.979637e+06 60322.180776 0.0 60322.180776 HL 10901100.0 91 1
4 2010-10-26 0.0 215 MUMBAI 89 14.982624 15.606900 14.982624 2 304 12001009.0 0 0.624276 0.0 -4.057794 -124 0 5 54433.0 1.048923e+07 0.000000 1.048923e+07 1.202181e+05 1.202181e+05 0.83 2010-10-26 54433.0 2018-04-05 4.0 7.755937e+06 54433.0 54433.0 7.200653e+06 7.755937e+06 30.94 0.000000 0 3 10.924830 180 7.200653e+06 6.593778e+06 5.552847e+05 27732.787464 0.0 27732.787464 HL 10901101.0 90 1
In [34]:
df['FORECLOSURE'].value_counts()
Out[34]:
0    17751
1     1741
Name: FORECLOSURE, dtype: int64
In [351]:
sns.barplot(x=df['FORECLOSURE'].value_counts().index,y=df['FORECLOSURE'].value_counts())
Out[351]:
<matplotlib.axes._subplots.AxesSubplot at 0x7da195bf60>
In [35]:
df['FORECLOSURE'].value_counts(normalize=True)
Out[35]:
0    0.910681
1    0.089319
Name: FORECLOSURE, dtype: float64

UNIVARIATE AND BIVARIATE ANALYSIS

In [36]:
sns.barplot(df.groupby(df['FORECLOSURE'])['BALANCE_EXCESS'].mean().index,df.groupby(df['FORECLOSURE'])['BALANCE_EXCESS'].mean().values)
plt.ylabel('Mean Balance Excess')

#Balance excess is higher in case of foreclosure. As the collaterals are being auctioned to pay off loans 
# theres is higher probability of getting the amount greater than the outstanding amount.
Out[36]:
Text(0, 0.5, 'Mean Balance Excess')
In [37]:
df['Percentage_Completion'] = (df['COMPLETED_TENURE']/df['CURRENT_TENOR']) *100
#Let us add a new column Percentage_Completion
In [38]:
df.head()
Out[38]:
AUTHORIZATIONDATE BALANCE_EXCESS BALANCE_TENURE CITY COMPLETED_TENURE CURRENT_INTEREST_RATE CURRENT_INTEREST_RATE_MAX CURRENT_INTEREST_RATE_MIN CURRENT_INTEREST_RATE_CHANGES CURRENT_TENOR CUSTOMERID DIFF_AUTH_INT_DATE DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_EMI_AMOUNT_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DIFF_ORIGINAL_CURRENT_TENOR DPD DUEDAY EMI_AMOUNT EMI_DUEAMT EMI_OS_AMOUNT EMI_RECEIVED_AMT EXCESS_ADJUSTED_AMT EXCESS_AVAILABLE FOIR INTEREST_START_DATE LAST_RECEIPT_AMOUNT LAST_RECEIPT_DATE LATEST_TRANSACTION_MONTH LOAN_AMT MAX_EMI_AMOUNT MIN_EMI_AMOUNT MONTHOPENING NET_DISBURSED_AMT NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_INTEREST_RATE ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PAID_INTEREST PAID_PRINCIPAL PRE_EMI_DUEAMT PRE_EMI_OS_AMOUNT PRE_EMI_RECEIVED_AMT PRODUCT SCHEMEID MOB FORECLOSURE Percentage_Completion
0 2010-08-29 0.0 0 MUMBAI 45 13.421934 13.734072 13.421934 1 45 12001000.0 1 0.312138 916441.0 -2.809242 75 0 1 83559.0 8.614898e+06 175477.782641 8.439420e+06 2.135869e+05 2.135869e+05 0.60 2010-08-30 1000000.0 2014-05-05 5.0 1.171011e+07 1000000.0 83559.0 8.693575e+06 1.171011e+07 40.06 175477.782641 1 1 10.612692 120 1.164472e+05 2.725723e+06 1.159366e+07 7994.273589 0.0 7994.273589 HL 10901100.0 45 1 100.000000
1 2010-09-15 0.0 99 MUMBAI 38 13.734072 13.734072 13.734072 0 137 12001001.0 0 0.000000 0.0 -3.121380 43 0 1 126530.0 1.061903e+07 279448.084560 1.033958e+07 0.000000e+00 0.000000e+00 0.60 2010-09-15 126530.0 2013-11-01 11.0 1.929025e+07 126530.0 126530.0 1.713861e+07 1.929025e+07 84.31 279448.084560 0 0 10.612692 180 1.234760e+07 4.945154e+06 6.942656e+06 100504.575864 0.0 100504.575864 HL 10901100.0 38 1 27.737226
2 2010-11-02 0.0 231 MUMBAI 81 15.606900 16.231176 15.606900 2 312 12001005.0 -1 0.624276 477122.0 -4.057794 -132 0 5 22878.0 4.670211e+06 0.000000 4.670211e+06 1.179716e+06 1.179716e+06 0.72 2010-11-01 22878.0 2017-08-05 8.0 3.933395e+06 500000.0 22878.0 3.744831e+06 3.933395e+06 50.89 0.000000 2 3 11.549106 180 2.994184e+06 2.990124e+06 9.392115e+05 5947.630536 0.0 5947.630536 HL 10901101.0 81 1 25.961538
3 2010-10-06 0.0 0 THANE 91 11.236968 13.734072 11.236968 4 91 12001007.0 0 2.497104 1434259.0 -0.624276 89 0 1 65741.0 1.313098e+07 63659.280762 1.306732e+07 6.770848e+06 6.770848e+06 0.60 2010-10-06 65741.0 2018-05-02 5.0 1.002259e+07 1500000.0 65741.0 5.747875e+06 1.002259e+07 84.63 63659.280762 3 8 10.612692 180 4.295120e+04 4.648994e+06 9.979637e+06 60322.180776 0.0 60322.180776 HL 10901100.0 91 1 100.000000
4 2010-10-26 0.0 215 MUMBAI 89 14.982624 15.606900 14.982624 2 304 12001009.0 0 0.624276 0.0 -4.057794 -124 0 5 54433.0 1.048923e+07 0.000000 1.048923e+07 1.202181e+05 1.202181e+05 0.83 2010-10-26 54433.0 2018-04-05 4.0 7.755937e+06 54433.0 54433.0 7.200653e+06 7.755937e+06 30.94 0.000000 0 3 10.924830 180 7.200653e+06 6.593778e+06 5.552847e+05 27732.787464 0.0 27732.787464 HL 10901101.0 90 1 29.276316
In [39]:
df.describe()
Out[39]:
BALANCE_EXCESS BALANCE_TENURE COMPLETED_TENURE CURRENT_INTEREST_RATE CURRENT_INTEREST_RATE_MAX CURRENT_INTEREST_RATE_MIN CURRENT_INTEREST_RATE_CHANGES CURRENT_TENOR CUSTOMERID DIFF_AUTH_INT_DATE DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_EMI_AMOUNT_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DIFF_ORIGINAL_CURRENT_TENOR DPD DUEDAY EMI_AMOUNT EMI_DUEAMT EMI_OS_AMOUNT EMI_RECEIVED_AMT EXCESS_ADJUSTED_AMT EXCESS_AVAILABLE FOIR LAST_RECEIPT_AMOUNT LATEST_TRANSACTION_MONTH LOAN_AMT MAX_EMI_AMOUNT MIN_EMI_AMOUNT MONTHOPENING NET_DISBURSED_AMT NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_INTEREST_RATE ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PAID_INTEREST PAID_PRINCIPAL PRE_EMI_DUEAMT PRE_EMI_OS_AMOUNT PRE_EMI_RECEIVED_AMT SCHEMEID MOB FORECLOSURE Percentage_Completion
count 1.949200e+04 19492.000000 19492.000000 19492.000000 19492.000000 19492.000000 19492.000000 19492.000000 1.949200e+04 19492.000000 19492.000000 1.949200e+04 19492.000000 19492.000000 19492.000000 19492.000000 1.949200e+04 1.949200e+04 1.949200e+04 1.949200e+04 1.949200e+04 1.949200e+04 19492.000000 1.949200e+04 19492.000000 1.949200e+04 1.949200e+04 1.949200e+04 1.949200e+04 1.949200e+04 19492.000000 1.949200e+04 19492.000000 19492.000000 19492.000000 19492.000000 1.949200e+04 1.949200e+04 1.949200e+04 1.949200e+04 1.949200e+04 1.949200e+04 1.949200e+04 19492.000000 19492.000000 19492.000000
mean 7.948855e+04 172.764621 17.553201 14.782359 14.901728 14.290696 0.774215 190.317823 1.201744e+07 0.006515 0.611031 1.153960e+05 -0.391187 -7.296429 6.999077 5.774164 4.408387e+04 2.019475e+06 3.140107e+04 1.988074e+06 3.299198e+05 4.094083e+05 28.688726 8.159529e+04 10.711625 5.867079e+06 1.223300e+05 6.933995e+03 5.417489e+06 5.821735e+06 50.988103 -4.783237e+04 3.003335 2.806023 14.391172 183.021393 5.201519e+06 1.003785e+06 8.565077e+05 5.792451e+04 2.551084e+02 5.766940e+04 1.090122e+07 19.123333 0.089319 11.351217
std 1.358587e+06 63.785806 16.432438 2.489826 2.485216 2.682558 1.141368 58.383521 9.486043e+03 0.577136 0.969852 9.725061e+05 0.885655 33.273565 60.536850 2.715239 1.139948e+05 6.904609e+06 6.393010e+05 6.831243e+06 3.501982e+06 3.780643e+06 3922.360286 8.140048e+05 2.804144 1.296993e+07 9.758556e+05 4.381522e+04 1.180354e+07 1.289796e+07 21.054703 1.346706e+06 2.618284 2.555721 2.609432 44.475040 1.154702e+07 3.055707e+06 3.512905e+07 3.733282e+05 1.105367e+04 3.726086e+05 8.890152e+01 16.461106 0.285211 13.965092
min 0.000000e+00 0.000000 0.000000 10.425409 10.425409 -5.056636 0.000000 6.000000 1.200100e+07 -17.000000 0.000000 0.000000e+00 -7.179174 -461.000000 0.000000 1.000000 1.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 -170.330000 1.000000e+00 1.000000 3.753239e+04 1.334000e+01 1.000000e-02 0.000000e+00 3.753239e+04 0.380000 -7.534554e+07 0.000000 0.000000 10.113271 14.000000 -7.506479e-01 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 1.090110e+07 0.000000 0.000000 0.000000
25% 0.000000e+00 136.000000 6.000000 12.797658 13.109796 12.423092 0.000000 166.000000 1.200926e+07 0.000000 0.000000 1.054900e+04 -1.186124 -16.000000 0.000000 5.000000 1.085400e+04 2.183137e+05 0.000000e+00 2.164767e+05 0.000000e+00 0.000000e+00 0.410000 1.116700e+04 12.000000 1.555369e+06 1.342000e+04 1.180000e+02 1.483564e+06 1.545589e+06 35.037500 -2.208552e+01 2.000000 1.000000 12.485520 180.000000 1.432082e+06 1.325291e+05 2.540117e+04 4.940531e+03 0.000000e+00 4.925071e+03 1.090111e+07 8.000000 0.000000 2.916667
50% 0.000000e+00 174.000000 12.000000 14.545631 14.857769 13.734072 0.000000 180.000000 1.201744e+07 0.000000 0.000000 2.015850e+04 0.000000 0.000000 0.000000 5.000000 1.914500e+04 5.611456e+05 0.000000e+00 5.531429e+05 0.000000e+00 2.606091e+02 0.520000 1.979600e+04 12.000000 2.679813e+06 2.369350e+04 1.180000e+02 2.501044e+06 2.639325e+06 53.020000 0.000000e+00 2.000000 2.000000 13.734072 180.000000 2.395707e+06 3.190869e+05 8.178234e+04 1.092460e+04 0.000000e+00 1.090693e+04 1.090126e+07 14.000000 0.000000 6.302521
75% 6.846511e+01 215.000000 25.000000 16.231176 16.543314 16.168748 2.000000 228.000000 1.202564e+07 0.000000 1.186124 4.302100e+04 0.000000 0.000000 0.000000 5.000000 3.676575e+04 1.503360e+06 0.000000e+00 1.480827e+06 2.606091e+02 3.170393e+03 0.680000 3.860825e+04 12.000000 5.215636e+06 4.995425e+04 3.105000e+03 4.778939e+06 5.172830e+06 66.542500 0.000000e+00 4.000000 3.000000 16.168748 228.000000 4.555872e+06 8.062026e+05 2.961328e+05 3.241216e+04 0.000000e+00 3.232437e+04 1.090129e+07 27.000000 0.000000 14.880952
max 7.555600e+07 674.000000 98.000000 25.095895 37.456560 24.034626 9.000000 713.000000 1.203390e+07 70.000000 24.346764 8.496825e+07 8.458187 234.000000 1790.000000 15.000000 4.879479e+06 3.546104e+08 5.899531e+07 3.546104e+08 2.841642e+08 2.841642e+08 547616.000000 8.496881e+07 12.000000 4.245665e+08 8.496881e+07 3.156965e+06 3.818367e+08 4.245665e+08 100.000000 3.864350e+07 33.000000 30.000000 27.780282 300.000000 3.818367e+08 1.230362e+08 4.885217e+09 3.177540e+07 1.074264e+06 3.177540e+07 1.090146e+07 98.000000 1.000000 100.000000
In [40]:
sns.barplot(df.groupby(df['FORECLOSURE'])['Percentage_Completion'].mean().index,df.groupby(df['FORECLOSURE'])['Percentage_Completion'].mean().values)
plt.ylabel('Mean Percentage Completion')

# This is a interesting insights. Person/Company which are foreclosuring has higher percentage completion rate. Which somehow
# shows us that people are not defaulting willfully but there is some reason due to which this is happening.
Out[40]:
Text(0, 0.5, 'Mean Percentage Completion')
In [353]:
sns.barplot(df.groupby(df['FORECLOSURE'])['BALANCE_TENURE'].mean().index,df.groupby(df['FORECLOSURE'])['BALANCE_TENURE'].mean().values)
plt.ylabel('Mean Balance Tenure')
# Mean completed tenure is more for person getting foreclosed.
Out[353]:
Text(0, 0.5, 'Mean Balance Tenure')
In [42]:
df.groupby(df['FORECLOSURE'])['CURRENT_TENOR'].mean()
Out[42]:
FORECLOSURE
0    190.391978
1    189.561746
Name: CURRENT_TENOR, dtype: float64
In [43]:
City_counts=pd.DataFrame(df.groupby(['FORECLOSURE','CITY'])['CUSTOMERID'].count().sort_values(ascending=False))
In [44]:
City_counts=City_counts.reset_index()
City_counts[City_counts['FORECLOSURE']==0].head()
Out[44]:
FORECLOSURE CITY CUSTOMERID
0 0 MUMBAI 1640
1 0 HYDERABAD 1381
2 0 SURAT 1323
3 0 AHMEDABAD 1280
4 0 PUNE 1026
In [45]:
City_counts[City_counts['FORECLOSURE']==1].head()
Out[45]:
FORECLOSURE CITY CUSTOMERID
12 1 MUMBAI 347
24 1 HYDERABAD 161
26 1 PUNE 147
36 1 CHENNAI 105
41 1 AHMEDABAD 87
In [46]:
sns.barplot(x=df.groupby(['FORECLOSURE'])['COMPLETED_TENURE'].mean().index,y=df.groupby(['FORECLOSURE'])['COMPLETED_TENURE'].mean().values)
plt.xlabel("Completed Tenure")
Out[46]:
Text(0.5, 0, 'Completed Tenure')
In [47]:
sns.barplot(np.array(City_counts[City_counts['FORECLOSURE']==1]['CITY'].head()),City_counts[City_counts['FORECLOSURE']==1]['CUSTOMERID'].head().values)
plt.xlabel('City with most number of FORECLOSURES')
# MUMBAI is the city recording highest number of foreclosure. Followed by Hyderabad. Closure look has to be given on these
#cities.
Out[47]:
Text(0.5, 0, 'City with most number of FORECLOSURES')
In [48]:
sns.barplot(np.array(City_counts[City_counts['FORECLOSURE']==0]['CITY'].head()),City_counts[City_counts['FORECLOSURE']==0]['CUSTOMERID'].head().values)
plt.xlabel('City with most number of non-FORECLOSURES')
# MUMBAI is the city recording highest number of non-foreclosure case as well . Followed by Hyderabad.
Out[48]:
Text(0.5, 0, 'City with most number of non-FORECLOSURES')
In [49]:
sns.barplot(df['CITY'].value_counts().index[:5],df['CITY'].value_counts().values[:5])
plt.xlabel('Cities with maximum number of loan given')
# MUMBAI AND HYDERABAD gave the highest number of loan.
Out[49]:
Text(0.5, 0, 'Cities with maximum number of loan given')
In [50]:
PRODUCT_STATUS=pd.DataFrame(df.groupby(['FORECLOSURE','PRODUCT'])['CITY'].count()).reset_index()
In [355]:
sns.barplot(x=PRODUCT_STATUS[PRODUCT_STATUS['FORECLOSURE'] == 0]['PRODUCT'],y=PRODUCT_STATUS[PRODUCT_STATUS['FORECLOSURE'] == 0]['CITY'])
plt.xlabel('Products count for non-foreclosure account')
plt.ylabel(' Counts ')
Out[355]:
Text(0, 0.5, ' Counts ')
In [354]:
sns.barplot(x=PRODUCT_STATUS[PRODUCT_STATUS['FORECLOSURE'] == 1]['PRODUCT'],y=PRODUCT_STATUS[PRODUCT_STATUS['FORECLOSURE'] == 1]['CITY'])
plt.xlabel('Products for foreclosure account')
plt.ylabel(' Counts ')
# Most of the foreclosure is seen for the PRODUCT - HL followed by STHL . A thorough inspection of customer should be 
# performed for giving away loans in the category - HL and STHL
Out[354]:
Text(0, 0.5, ' Counts ')
In [356]:
sns.barplot(df['PRODUCT'].value_counts().index,df['PRODUCT'].value_counts().values)
plt.xlabel('Product Codes')
plt.ylabel(' Product Counts ')
#Maximum number of loans has been disbursed as Short Term Home Loan.
Out[356]:
Text(0, 0.5, ' Product Counts ')
In [54]:
df['MOB'].value_counts(ascending=False).head()
Out[54]:
9     1037
7      840
12     834
10     822
4      819
Name: MOB, dtype: int64
In [55]:
sns.barplot(df['MOB'].value_counts(ascending=False).head().index,df['MOB'].value_counts(ascending=False).head())
plt.xlabel('Most popular code')
# The most popular code is coming as 9 followed by 7 and 12
Out[55]:
Text(0.5, 0, 'Most popular code')
In [56]:
plt.figure(figsize=(10,5))
sns.barplot(df['PRODUCT'], df['LOAN_AMT'],hue=df['FORECLOSURE'])
# The mean value of loan amount for Product LAP is highest. Moreover those person getting foreclosed has highest mean loan
# amount compared to others.
Out[56]:
<matplotlib.axes._subplots.AxesSubplot at 0x7d8bf49898>
In [57]:
plt.figure(figsize=(8,5))
sns.barplot(df['PRODUCT'], df['LOAN_AMT'])
Out[57]:
<matplotlib.axes._subplots.AxesSubplot at 0x7d8c07e550>
In [58]:
sns.barplot(df['FORECLOSURE'], df['NET_LTV'])
#  Net Loan to Value ratio looks same for both the case.
Out[58]:
<matplotlib.axes._subplots.AxesSubplot at 0x7d8e6d18d0>
In [59]:
df['NET_LTV_RANGE']=pd.cut(df['NET_LTV'], [0,10,20,30,40,50,60,70,80,90,100], include_lowest=True)
#Introducing a new column NET_LTV_RANGE to understand under which bracket the NET_LTV is falling . This may help us to 
#get some new insights.
In [60]:
df[['NET_LTV','NET_LTV_RANGE']].head()
Out[60]:
NET_LTV NET_LTV_RANGE
0 40.06 (40.0, 50.0]
1 84.31 (80.0, 90.0]
2 50.89 (50.0, 60.0]
3 84.63 (80.0, 90.0]
4 30.94 (30.0, 40.0]
In [61]:
plt.figure(figsize=(10,5))
sns.countplot(y=df['NET_LTV_RANGE'])
# We can see that most number of loans has been given to those having NET_LTV_RANGE between 40 to 70.
Out[61]:
<matplotlib.axes._subplots.AxesSubplot at 0x7d8c1ed588>
In [62]:
plt.figure(figsize=(10,5))
sns.barplot(y=df[df['FORECLOSURE'] == 1]['NET_LTV_RANGE'].value_counts().index , x=df[df['FORECLOSURE'] == 1]['NET_LTV_RANGE'].value_counts().values)
plt.xlabel('Counts corresponding to FORECLOSURE')
# We can see that most number of foreclosure has happened when NET_LTV_RANGE between 70 to 80.
# Bank should try to give loan at safer side in the range od 30 to 50 . As most of the foreclosure is taking place around 
# 50-80
Out[62]:
Text(0.5, 0, 'Counts corresponding to FORECLOSURE')
In [63]:
plt.figure(figsize=(10,5))
sns.countplot(y=df['NET_LTV_RANGE'],hue=df['FORECLOSURE'])
plt.xlabel(' Respective counts corresponding to FORECLOSURE & NON-FORECLOSURE')
Out[63]:
Text(0.5, 0, ' Respective counts corresponding to FORECLOSURE & NON-FORECLOSURE')
In [64]:
df['SCHEMEID'].value_counts().head()
Out[64]:
10901104.0    2328
10901106.0    1444
10901295.0    1089
10901287.0    1015
10901112.0     989
Name: SCHEMEID, dtype: int64
In [65]:
plt.figure(figsize=(10,5))
sns.barplot(x=df['SCHEMEID'].value_counts().head(5).index , y=df['SCHEMEID'].value_counts().head().values)
plt.xlabel('Most popular scheme')
Out[65]:
Text(0.5, 0, 'Most popular scheme')
In [66]:
plt.figure(figsize=(10,5))
sns.barplot(x=df[df['FORECLOSURE'] == 1]['SCHEMEID'].value_counts(ascending=False).head(5).index , y=df[df['FORECLOSURE'] == 1]['SCHEMEID'].value_counts(ascending=False).head(5).values)
plt.xlabel(' Top 5 Schemes resulting in maximum foreclosure')
Out[66]:
Text(0.5, 0, ' Top 5 Schemes resulting in maximum foreclosure')
In [67]:
plt.figure(figsize=(10,5))
sns.barplot(x=df[df['FORECLOSURE'] == 0]['SCHEMEID'].value_counts(ascending=False).head(5).index , y=df[df['FORECLOSURE'] == 0]['SCHEMEID'].value_counts(ascending=False).head(5).values)
plt.xlabel('Top 5 Schemes having maximum number of non-foreclosure case')
Out[67]:
Text(0.5, 0, 'Top 5 Schemes having maximum number of non-foreclosure case')
In [68]:
df['SCHEMEID'].value_counts(ascending=True).head(48)
#Least popular scheme id
Out[68]:
10901345.0    1
10901453.0    1
10901425.0    1
10901235.0    1
10901140.0    1
10901262.0    1
10901282.0    1
10901409.0    1
10901172.0    1
10901132.0    1
10901224.0    1
10901388.0    1
10901414.0    1
10901193.0    1
10901189.0    1
10901212.0    1
10901144.0    1
10901404.0    1
10901116.0    1
10901168.0    1
10901410.0    1
10901427.0    1
10901382.0    1
10901149.0    1
10901391.0    1
10901260.0    1
10901180.0    1
10901341.0    1
10901205.0    1
10901363.0    1
10901135.0    1
10901418.0    1
10901304.0    1
10901171.0    1
10901385.0    1
10901246.0    1
10901157.0    1
10901455.0    1
10901187.0    1
10901164.0    1
10901208.0    1
10901191.0    1
10901166.0    1
10901228.0    1
10901379.0    1
10901403.0    1
10901227.0    1
10901338.0    1
Name: SCHEMEID, dtype: int64
In [69]:
plt.figure(figsize=(8,5))
sns.barplot(df['MOB'].value_counts(ascending=False).head().index,df['MOB'].value_counts(ascending=False).head())
plt.xlabel('Most popular Internal code')
Out[69]:
Text(0.5, 0, 'Most popular Internal code')
In [70]:
plt.figure(figsize=(8,5))
sns.barplot(x=df[df['FORECLOSURE'] == 1]['MOB'].value_counts(ascending=False).head(5).index , y=df[df['FORECLOSURE'] == 1]['MOB'].value_counts(ascending=False).head(5).values)
plt.xlabel('Top 5 Internal code having maximum number of foreclosure case')
Out[70]:
Text(0.5, 0, 'Top 5 Internal code having maximum number of foreclosure case')
In [71]:
plt.figure(figsize=(8,5))
sns.barplot(x=df[df['FORECLOSURE'] == 0]['MOB'].value_counts(ascending=False).head(5).index , y=df[df['FORECLOSURE'] == 0]['MOB'].value_counts(ascending=False).head(5).values)
plt.xlabel('Top 5 Internal code having maximum number of non-foreclosure case')
Out[71]:
Text(0.5, 0, 'Top 5 Internal code having maximum number of non-foreclosure case')
In [72]:
df['FOIR_Range']=pd.cut(df['FOIR'], [0,0.20,0.40,0.60,0.80,1,5,10,15,20,25,30,35], include_lowest=True)
In [73]:
df.isnull().sum()
Out[73]:
AUTHORIZATIONDATE                      0
BALANCE_EXCESS                         0
BALANCE_TENURE                         0
CITY                                   0
COMPLETED_TENURE                       0
CURRENT_INTEREST_RATE                  0
CURRENT_INTEREST_RATE_MAX              0
CURRENT_INTEREST_RATE_MIN              0
CURRENT_INTEREST_RATE_CHANGES          0
CURRENT_TENOR                          0
CUSTOMERID                             0
DIFF_AUTH_INT_DATE                     0
DIFF_CURRENT_INTEREST_RATE_MAX_MIN     0
DIFF_EMI_AMOUNT_MAX_MIN                0
DIFF_ORIGINAL_CURRENT_INTEREST_RATE    0
DIFF_ORIGINAL_CURRENT_TENOR            0
DPD                                    0
DUEDAY                                 0
EMI_AMOUNT                             0
EMI_DUEAMT                             0
EMI_OS_AMOUNT                          0
EMI_RECEIVED_AMT                       0
EXCESS_ADJUSTED_AMT                    0
EXCESS_AVAILABLE                       0
FOIR                                   0
INTEREST_START_DATE                    0
LAST_RECEIPT_AMOUNT                    0
LAST_RECEIPT_DATE                      0
LATEST_TRANSACTION_MONTH               0
LOAN_AMT                               0
MAX_EMI_AMOUNT                         0
MIN_EMI_AMOUNT                         0
MONTHOPENING                           0
NET_DISBURSED_AMT                      0
NET_LTV                                0
NET_RECEIVABLE                         0
NUM_EMI_CHANGES                        0
NUM_LOW_FREQ_TRANSACTIONS              0
ORIGNAL_INTEREST_RATE                  0
ORIGNAL_TENOR                          0
OUTSTANDING_PRINCIPAL                  0
PAID_INTEREST                          0
PAID_PRINCIPAL                         0
PRE_EMI_DUEAMT                         0
PRE_EMI_OS_AMOUNT                      0
PRE_EMI_RECEIVED_AMT                   0
PRODUCT                                0
SCHEMEID                               0
MOB                                    0
FORECLOSURE                            0
Percentage_Completion                  0
NET_LTV_RANGE                          0
FOIR_Range                             2
dtype: int64
In [74]:
#We can clearly see that these records for FOIR column in incorrectly inserted as FOIR cannot be negative and cannot have
#such high values. Hence we will going to drop these 2 rows.
df[df['FOIR_Range'].isnull()][['FOIR','FOIR_Range']]
Out[74]:
FOIR FOIR_Range
2927 547616.00 NaN
8055 -170.33 NaN
In [75]:
df.dropna(inplace=True)
In [76]:
df[df['FOIR_Range'].isnull()][['FOIR','FOIR_Range']]
Out[76]:
FOIR FOIR_Range
In [77]:
df['FOIR_Range'].value_counts()
Out[77]:
(0.4, 0.6]       8530
(0.2, 0.4]       4168
(0.6, 0.8]       3222
(0.8, 1.0]       1790
(1.0, 5.0]       1092
(-0.001, 0.2]     640
(5.0, 10.0]        32
(10.0, 15.0]       11
(30.0, 35.0]        2
(25.0, 30.0]        1
(20.0, 25.0]        1
(15.0, 20.0]        1
Name: FOIR_Range, dtype: int64
In [78]:
plt.figure(figsize=(8,5))
sns.countplot(y=df['FOIR_Range'])
plt.xlabel('Total counts for a specific range ')
Out[78]:
Text(0.5, 0, 'Total counts for a specific range ')
In [79]:
plt.figure(figsize=(8,5))
sns.barplot(y=df[df['FORECLOSURE'] == 1]['FOIR_Range'].value_counts(ascending=False).head(5).index , x=df[df['FORECLOSURE'] == 1]['FOIR_Range'].value_counts(ascending=False).head(5).values)
plt.xlabel('FORECLOSURE counts for a specific range ')
Out[79]:
Text(0.5, 0, 'FORECLOSURE counts for a specific range ')
In [80]:
FOIR_RATIO = (df[df['FORECLOSURE'] == 1]['FOIR_Range'].value_counts())/(df['FOIR_Range'].value_counts())
FOIR_RATIO
#Creating a new feature comprising of the ratio between FORECLOSURE counts in a specific range to the Total counts 
# in that range
Out[80]:
(-0.001, 0.2]    0.101562
(0.2, 0.4]       0.077735
(0.4, 0.6]       0.090387
(0.6, 0.8]       0.065177
(0.8, 1.0]       0.138547
(1.0, 5.0]       0.108974
(5.0, 10.0]      0.125000
(10.0, 15.0]     0.000000
(15.0, 20.0]     0.000000
(20.0, 25.0]     0.000000
(25.0, 30.0]     0.000000
(30.0, 35.0]     0.000000
Name: FOIR_Range, dtype: float64
In [81]:
plt.figure(figsize=(8,5))
sns.barplot(y=FOIR_RATIO.index,x=FOIR_RATIO.values)
plt.xlabel('Ratio between FORECLOSURE counts in a specific range to the Total counts in that range ')
#We can very well observed that whenever the FOIR ratio exceeded 80 % , the count of foreclosue is drastically increased.
#Moreover in those case when the FOIR ratio is less than 20 percent has also greater count of foreclosure. This may be the 
# case of faking oneself income or defaulting willfully and hence the closure look need to be given upen this.
Out[81]:
Text(0.5, 0, 'Ratio between FORECLOSURE counts in a specific range to the Total counts in that range ')
In [82]:
df.head()
Out[82]:
AUTHORIZATIONDATE BALANCE_EXCESS BALANCE_TENURE CITY COMPLETED_TENURE CURRENT_INTEREST_RATE CURRENT_INTEREST_RATE_MAX CURRENT_INTEREST_RATE_MIN CURRENT_INTEREST_RATE_CHANGES CURRENT_TENOR CUSTOMERID DIFF_AUTH_INT_DATE DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_EMI_AMOUNT_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DIFF_ORIGINAL_CURRENT_TENOR DPD DUEDAY EMI_AMOUNT EMI_DUEAMT EMI_OS_AMOUNT EMI_RECEIVED_AMT EXCESS_ADJUSTED_AMT EXCESS_AVAILABLE FOIR INTEREST_START_DATE LAST_RECEIPT_AMOUNT LAST_RECEIPT_DATE LATEST_TRANSACTION_MONTH LOAN_AMT MAX_EMI_AMOUNT MIN_EMI_AMOUNT MONTHOPENING NET_DISBURSED_AMT NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_INTEREST_RATE ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PAID_INTEREST PAID_PRINCIPAL PRE_EMI_DUEAMT PRE_EMI_OS_AMOUNT PRE_EMI_RECEIVED_AMT PRODUCT SCHEMEID MOB FORECLOSURE Percentage_Completion NET_LTV_RANGE FOIR_Range
0 2010-08-29 0.0 0 MUMBAI 45 13.421934 13.734072 13.421934 1 45 12001000.0 1 0.312138 916441.0 -2.809242 75 0 1 83559.0 8.614898e+06 175477.782641 8.439420e+06 2.135869e+05 2.135869e+05 0.60 2010-08-30 1000000.0 2014-05-05 5.0 1.171011e+07 1000000.0 83559.0 8.693575e+06 1.171011e+07 40.06 175477.782641 1 1 10.612692 120 1.164472e+05 2.725723e+06 1.159366e+07 7994.273589 0.0 7994.273589 HL 10901100.0 45 1 100.000000 (40.0, 50.0] (0.4, 0.6]
1 2010-09-15 0.0 99 MUMBAI 38 13.734072 13.734072 13.734072 0 137 12001001.0 0 0.000000 0.0 -3.121380 43 0 1 126530.0 1.061903e+07 279448.084560 1.033958e+07 0.000000e+00 0.000000e+00 0.60 2010-09-15 126530.0 2013-11-01 11.0 1.929025e+07 126530.0 126530.0 1.713861e+07 1.929025e+07 84.31 279448.084560 0 0 10.612692 180 1.234760e+07 4.945154e+06 6.942656e+06 100504.575864 0.0 100504.575864 HL 10901100.0 38 1 27.737226 (80.0, 90.0] (0.4, 0.6]
2 2010-11-02 0.0 231 MUMBAI 81 15.606900 16.231176 15.606900 2 312 12001005.0 -1 0.624276 477122.0 -4.057794 -132 0 5 22878.0 4.670211e+06 0.000000 4.670211e+06 1.179716e+06 1.179716e+06 0.72 2010-11-01 22878.0 2017-08-05 8.0 3.933395e+06 500000.0 22878.0 3.744831e+06 3.933395e+06 50.89 0.000000 2 3 11.549106 180 2.994184e+06 2.990124e+06 9.392115e+05 5947.630536 0.0 5947.630536 HL 10901101.0 81 1 25.961538 (50.0, 60.0] (0.6, 0.8]
3 2010-10-06 0.0 0 THANE 91 11.236968 13.734072 11.236968 4 91 12001007.0 0 2.497104 1434259.0 -0.624276 89 0 1 65741.0 1.313098e+07 63659.280762 1.306732e+07 6.770848e+06 6.770848e+06 0.60 2010-10-06 65741.0 2018-05-02 5.0 1.002259e+07 1500000.0 65741.0 5.747875e+06 1.002259e+07 84.63 63659.280762 3 8 10.612692 180 4.295120e+04 4.648994e+06 9.979637e+06 60322.180776 0.0 60322.180776 HL 10901100.0 91 1 100.000000 (80.0, 90.0] (0.4, 0.6]
4 2010-10-26 0.0 215 MUMBAI 89 14.982624 15.606900 14.982624 2 304 12001009.0 0 0.624276 0.0 -4.057794 -124 0 5 54433.0 1.048923e+07 0.000000 1.048923e+07 1.202181e+05 1.202181e+05 0.83 2010-10-26 54433.0 2018-04-05 4.0 7.755937e+06 54433.0 54433.0 7.200653e+06 7.755937e+06 30.94 0.000000 0 3 10.924830 180 7.200653e+06 6.593778e+06 5.552847e+05 27732.787464 0.0 27732.787464 HL 10901101.0 90 1 29.276316 (30.0, 40.0] (0.8, 1.0]
In [83]:
sns.countplot(x=df[df['FORECLOSURE'] == 0]['CURRENT_INTEREST_RATE_CHANGES'])
plt.xlabel('Effect of CURRENT_INTEREST_RATE_CHANGES on Non-Foreclosure')
Out[83]:
Text(0.5, 0, 'Effect of CURRENT_INTEREST_RATE_CHANGES on Non-Foreclosure')
In [84]:
sns.countplot(x=df[df['FORECLOSURE'] == 1]['CURRENT_INTEREST_RATE_CHANGES'])
plt.xlabel('Effect of CURRENT_INTEREST_RATE_CHANGES on Foreclosure')
#Current interest rate changes does not have any effect on foreclosure
Out[84]:
Text(0.5, 0, 'Effect of CURRENT_INTEREST_RATE_CHANGES on Foreclosure')
In [85]:
df['Mean_Interest_Rate'] =(df['CURRENT_INTEREST_RATE_MAX'] + df['CURRENT_INTEREST_RATE'] + df['CURRENT_INTEREST_RATE_MIN'])/3
In [86]:
sns.barplot(y=df['Mean_Interest_Rate'],x=df['FORECLOSURE'])
# People getting foreclosed have higher mean interest rate. Although the difference is not that great. But still it can 
# have positive impact on increasing case of foreclosure
Out[86]:
<matplotlib.axes._subplots.AxesSubplot at 0x7d91f93128>
In [87]:
df.groupby(['FORECLOSURE'])['Mean_Interest_Rate'].mean()
Out[87]:
FORECLOSURE
0    14.636752
1    14.878486
Name: Mean_Interest_Rate, dtype: float64
In [88]:
sns.barplot(y=df['LOAN_AMT'],x=df['FORECLOSURE'])
# People getting foreclosed have higher loan amount sanctioned. Greater the loan amount
# higher the chance of foreclosure
Out[88]:
<matplotlib.axes._subplots.AxesSubplot at 0x7d91ecb748>
In [89]:
sns.barplot(y=df['ORIGNAL_TENOR'],x=df['FORECLOSURE'])
# People having higher original tenor results in higher the chance of foreclosure
Out[89]:
<matplotlib.axes._subplots.AxesSubplot at 0x7d919ad630>
In [90]:
sns.barplot(y=df['COMPLETED_TENURE'],x=df['FORECLOSURE'])
#Person getting foreclosed have larger completed tenure
Out[90]:
<matplotlib.axes._subplots.AxesSubplot at 0x7d90fc9c50>
In [91]:
sns.barplot(y=df['DPD'],x=df['FORECLOSURE'])
plt.ylabel('Days past due')
#Days past due is an important factor for FORECLOSURE. People getting foreclosed have higher units of DPD
Out[91]:
Text(0, 0.5, 'Days past due')
In [92]:
sns.barplot(y=df['DIFF_ORIGINAL_CURRENT_INTEREST_RATE'],x=df['FORECLOSURE'])
# Difference between Original and Current interest rate is insignificant incase of Foreclosure.
Out[92]:
<matplotlib.axes._subplots.AxesSubplot at 0x7d919c77f0>
In [93]:
df.groupby(['FORECLOSURE'])['DIFF_ORIGINAL_CURRENT_INTEREST_RATE'].mean()
Out[93]:
FORECLOSURE
0   -0.427351
1   -0.022951
Name: DIFF_ORIGINAL_CURRENT_INTEREST_RATE, dtype: float64
In [94]:
df['MEAN_EMI_AMOUNT'] =(df['EMI_AMOUNT'] + df['MAX_EMI_AMOUNT'] + df['MIN_EMI_AMOUNT'])/3
In [95]:
sns.barplot(y=df['MEAN_EMI_AMOUNT'],x=df['FORECLOSURE'])
# Person getting foreclosed are paying higher mean_emi_amount.
Out[95]:
<matplotlib.axes._subplots.AxesSubplot at 0x7d911f5a58>
In [96]:
sns.barplot(y=df['DIFF_CURRENT_INTEREST_RATE_MAX_MIN'],x=df['FORECLOSURE'])
Out[96]:
<matplotlib.axes._subplots.AxesSubplot at 0x7d918ac048>
In [97]:
plt.figure(figsize=(12,8))
sns.scatterplot(x='ORIGNAL_TENOR',y='MEAN_EMI_AMOUNT',data=df,hue='FORECLOSURE')
#We can see that most of the foreclosure case is seen in those case when the Original Tenor is exceeds 180 months of time 
# frame . Also in maximum case MEAN_EMI_AMOUNT values lies below 15000000 .
Out[97]:
<matplotlib.axes._subplots.AxesSubplot at 0x7d91ad48d0>
In [98]:
print(df[df['ORIGNAL_TENOR'] < 180][df['FORECLOSURE'] == 1].shape)
print(df[df['ORIGNAL_TENOR'] < 190][df['FORECLOSURE'] == 1].shape)
(285, 55)
(1067, 55)
In [99]:
df.head()
Out[99]:
AUTHORIZATIONDATE BALANCE_EXCESS BALANCE_TENURE CITY COMPLETED_TENURE CURRENT_INTEREST_RATE CURRENT_INTEREST_RATE_MAX CURRENT_INTEREST_RATE_MIN CURRENT_INTEREST_RATE_CHANGES CURRENT_TENOR CUSTOMERID DIFF_AUTH_INT_DATE DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_EMI_AMOUNT_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DIFF_ORIGINAL_CURRENT_TENOR DPD DUEDAY EMI_AMOUNT EMI_DUEAMT EMI_OS_AMOUNT EMI_RECEIVED_AMT EXCESS_ADJUSTED_AMT EXCESS_AVAILABLE FOIR INTEREST_START_DATE LAST_RECEIPT_AMOUNT LAST_RECEIPT_DATE LATEST_TRANSACTION_MONTH LOAN_AMT MAX_EMI_AMOUNT MIN_EMI_AMOUNT MONTHOPENING NET_DISBURSED_AMT NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_INTEREST_RATE ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PAID_INTEREST PAID_PRINCIPAL PRE_EMI_DUEAMT PRE_EMI_OS_AMOUNT PRE_EMI_RECEIVED_AMT PRODUCT SCHEMEID MOB FORECLOSURE Percentage_Completion NET_LTV_RANGE FOIR_Range Mean_Interest_Rate MEAN_EMI_AMOUNT
0 2010-08-29 0.0 0 MUMBAI 45 13.421934 13.734072 13.421934 1 45 12001000.0 1 0.312138 916441.0 -2.809242 75 0 1 83559.0 8.614898e+06 175477.782641 8.439420e+06 2.135869e+05 2.135869e+05 0.60 2010-08-30 1000000.0 2014-05-05 5.0 1.171011e+07 1000000.0 83559.0 8.693575e+06 1.171011e+07 40.06 175477.782641 1 1 10.612692 120 1.164472e+05 2.725723e+06 1.159366e+07 7994.273589 0.0 7994.273589 HL 10901100.0 45 1 100.000000 (40.0, 50.0] (0.4, 0.6] 13.525980 389039.333333
1 2010-09-15 0.0 99 MUMBAI 38 13.734072 13.734072 13.734072 0 137 12001001.0 0 0.000000 0.0 -3.121380 43 0 1 126530.0 1.061903e+07 279448.084560 1.033958e+07 0.000000e+00 0.000000e+00 0.60 2010-09-15 126530.0 2013-11-01 11.0 1.929025e+07 126530.0 126530.0 1.713861e+07 1.929025e+07 84.31 279448.084560 0 0 10.612692 180 1.234760e+07 4.945154e+06 6.942656e+06 100504.575864 0.0 100504.575864 HL 10901100.0 38 1 27.737226 (80.0, 90.0] (0.4, 0.6] 13.734072 126530.000000
2 2010-11-02 0.0 231 MUMBAI 81 15.606900 16.231176 15.606900 2 312 12001005.0 -1 0.624276 477122.0 -4.057794 -132 0 5 22878.0 4.670211e+06 0.000000 4.670211e+06 1.179716e+06 1.179716e+06 0.72 2010-11-01 22878.0 2017-08-05 8.0 3.933395e+06 500000.0 22878.0 3.744831e+06 3.933395e+06 50.89 0.000000 2 3 11.549106 180 2.994184e+06 2.990124e+06 9.392115e+05 5947.630536 0.0 5947.630536 HL 10901101.0 81 1 25.961538 (50.0, 60.0] (0.6, 0.8] 15.814992 181918.666667
3 2010-10-06 0.0 0 THANE 91 11.236968 13.734072 11.236968 4 91 12001007.0 0 2.497104 1434259.0 -0.624276 89 0 1 65741.0 1.313098e+07 63659.280762 1.306732e+07 6.770848e+06 6.770848e+06 0.60 2010-10-06 65741.0 2018-05-02 5.0 1.002259e+07 1500000.0 65741.0 5.747875e+06 1.002259e+07 84.63 63659.280762 3 8 10.612692 180 4.295120e+04 4.648994e+06 9.979637e+06 60322.180776 0.0 60322.180776 HL 10901100.0 91 1 100.000000 (80.0, 90.0] (0.4, 0.6] 12.069336 543827.333333
4 2010-10-26 0.0 215 MUMBAI 89 14.982624 15.606900 14.982624 2 304 12001009.0 0 0.624276 0.0 -4.057794 -124 0 5 54433.0 1.048923e+07 0.000000 1.048923e+07 1.202181e+05 1.202181e+05 0.83 2010-10-26 54433.0 2018-04-05 4.0 7.755937e+06 54433.0 54433.0 7.200653e+06 7.755937e+06 30.94 0.000000 0 3 10.924830 180 7.200653e+06 6.593778e+06 5.552847e+05 27732.787464 0.0 27732.787464 HL 10901101.0 90 1 29.276316 (30.0, 40.0] (0.8, 1.0] 15.190716 54433.000000
In [100]:
plt.figure(figsize=(10,5))
sns.barplot(x=df.groupby(df['AUTHORIZATIONDATE'].dt.year)['NET_DISBURSED_AMT'].sum().index,y=df.groupby(df['AUTHORIZATIONDATE'].dt.year)['NET_DISBURSED_AMT'].sum().values)
plt.xlabel('Sum of Net Disbursed Amount each year')
#THere has been a tremendous increase in disbursed loan amount in 2017 and 2018 .A closer look must be given to find out the
# the reason behind it. Such a drastic increase in loan disbursment may be due to increase in giving substandard loan.
Out[100]:
Text(0.5, 0, 'Sum of Net Disbursed Amount each year')
In [101]:
plt.figure(figsize=(10,5))
sns.barplot(x=df.groupby(df['AUTHORIZATIONDATE'].dt.year)['NET_DISBURSED_AMT'].count().index,y=df.groupby(df['AUTHORIZATIONDATE'].dt.year)['NET_DISBURSED_AMT'].count().values)
plt.xlabel(' Count each year')
# The customer counts after 2016 has increased significantly which may be the reason for increase in value of loan disbursed.
Out[101]:
Text(0.5, 0, ' Count each year')
In [102]:
plt.figure(figsize=(10,5))
sns.barplot(x=df.groupby(df['AUTHORIZATIONDATE'].dt.month)['NET_DISBURSED_AMT'].sum().index,y=df.groupby(df['AUTHORIZATIONDATE'].dt.month)['NET_DISBURSED_AMT'].sum().values)
plt.xlabel('Sum of Net Disbursed Amount each year')
# In the month of March the maximum amount of disbursment has taken place.
Out[102]:
Text(0.5, 0, 'Sum of Net Disbursed Amount each year')
In [103]:
plt.figure(figsize=(10,5))
sns.barplot(x=df.groupby(df['AUTHORIZATIONDATE'].dt.month)['NET_DISBURSED_AMT'].count().index,y=df.groupby(df['AUTHORIZATIONDATE'].dt.month)['NET_DISBURSED_AMT'].count().values)
plt.xlabel('Sum of Net Disbursed Amount each year')
# Customer counts goes similar to the sum of the loan disbursed each month.
Out[103]:
Text(0.5, 0, 'Sum of Net Disbursed Amount each year')
In [104]:
plt.figure(figsize=(10,5))
sns.barplot(x=df[df['FORECLOSURE'] == 1].groupby(df['LAST_RECEIPT_DATE'].dt.year)['FORECLOSURE'].count().index,y=df[df['FORECLOSURE'] == 1].groupby(df['LAST_RECEIPT_DATE'].dt.year)['FORECLOSURE'].count().values)
plt.xlabel(' Year performance in case of Forelosure')
# Year 2017 has seen the most number of foreclosure. If we assume this dataset belongs to the companies from India , 
# demonitisation may have played a huge role in increase in foreclosure.
Out[104]:
Text(0.5, 0, ' Year performance in case of Forelosure')
In [105]:
plt.figure(figsize=(10,5))
sns.barplot(x=df[df['FORECLOSURE'] == 1].groupby(df['LAST_RECEIPT_DATE'].dt.month)['FORECLOSURE'].count().index,y=df[df['FORECLOSURE'] == 1].groupby(df['LAST_RECEIPT_DATE'].dt.month)['FORECLOSURE'].count().values)
plt.xlabel(' Year performance in case of Forelosure')
# No visible pattern is seen . Nothing can be inferred in this case.
Out[105]:
Text(0.5, 0, ' Year performance in case of Forelosure')
In [106]:
plt.figure(figsize=(8,5))
sns.barplot(x=df[df['FORECLOSURE'] == 1].groupby(df['LATEST_TRANSACTION_MONTH'])['FORECLOSURE'].count().index,y=df[df['FORECLOSURE'] == 1].groupby(df['LATEST_TRANSACTION_MONTH'])['FORECLOSURE'].count().values)
plt.xlabel(' LATEST TRANSACTION MONTH in case of Forelosure')
Out[106]:
Text(0.5, 0, ' LATEST TRANSACTION MONTH in case of Forelosure')
In [107]:
plt.figure(figsize=(8,5))
sns.barplot(x=df[df['FORECLOSURE'] == 0].groupby(df['LATEST_TRANSACTION_MONTH'])['FORECLOSURE'].count().index,y=df[df['FORECLOSURE'] == 0].groupby(df['LATEST_TRANSACTION_MONTH'])['FORECLOSURE'].count().values)
plt.xlabel(' LATEST TRANSACTION MONTH in case of Non - Forelosure')
Out[107]:
Text(0.5, 0, ' LATEST TRANSACTION MONTH in case of Non - Forelosure')
In [108]:
plt.figure(figsize=(8,5))
sns.countplot(df['LATEST_TRANSACTION_MONTH'] , hue=df['FORECLOSURE'])
plt.xlabel(' LATEST TRANSACTION MONTH in case of Forelosure')
Out[108]:
Text(0.5, 0, ' LATEST TRANSACTION MONTH in case of Forelosure')
In [109]:
df.head()
Out[109]:
AUTHORIZATIONDATE BALANCE_EXCESS BALANCE_TENURE CITY COMPLETED_TENURE CURRENT_INTEREST_RATE CURRENT_INTEREST_RATE_MAX CURRENT_INTEREST_RATE_MIN CURRENT_INTEREST_RATE_CHANGES CURRENT_TENOR CUSTOMERID DIFF_AUTH_INT_DATE DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_EMI_AMOUNT_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DIFF_ORIGINAL_CURRENT_TENOR DPD DUEDAY EMI_AMOUNT EMI_DUEAMT EMI_OS_AMOUNT EMI_RECEIVED_AMT EXCESS_ADJUSTED_AMT EXCESS_AVAILABLE FOIR INTEREST_START_DATE LAST_RECEIPT_AMOUNT LAST_RECEIPT_DATE LATEST_TRANSACTION_MONTH LOAN_AMT MAX_EMI_AMOUNT MIN_EMI_AMOUNT MONTHOPENING NET_DISBURSED_AMT NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_INTEREST_RATE ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PAID_INTEREST PAID_PRINCIPAL PRE_EMI_DUEAMT PRE_EMI_OS_AMOUNT PRE_EMI_RECEIVED_AMT PRODUCT SCHEMEID MOB FORECLOSURE Percentage_Completion NET_LTV_RANGE FOIR_Range Mean_Interest_Rate MEAN_EMI_AMOUNT
0 2010-08-29 0.0 0 MUMBAI 45 13.421934 13.734072 13.421934 1 45 12001000.0 1 0.312138 916441.0 -2.809242 75 0 1 83559.0 8.614898e+06 175477.782641 8.439420e+06 2.135869e+05 2.135869e+05 0.60 2010-08-30 1000000.0 2014-05-05 5.0 1.171011e+07 1000000.0 83559.0 8.693575e+06 1.171011e+07 40.06 175477.782641 1 1 10.612692 120 1.164472e+05 2.725723e+06 1.159366e+07 7994.273589 0.0 7994.273589 HL 10901100.0 45 1 100.000000 (40.0, 50.0] (0.4, 0.6] 13.525980 389039.333333
1 2010-09-15 0.0 99 MUMBAI 38 13.734072 13.734072 13.734072 0 137 12001001.0 0 0.000000 0.0 -3.121380 43 0 1 126530.0 1.061903e+07 279448.084560 1.033958e+07 0.000000e+00 0.000000e+00 0.60 2010-09-15 126530.0 2013-11-01 11.0 1.929025e+07 126530.0 126530.0 1.713861e+07 1.929025e+07 84.31 279448.084560 0 0 10.612692 180 1.234760e+07 4.945154e+06 6.942656e+06 100504.575864 0.0 100504.575864 HL 10901100.0 38 1 27.737226 (80.0, 90.0] (0.4, 0.6] 13.734072 126530.000000
2 2010-11-02 0.0 231 MUMBAI 81 15.606900 16.231176 15.606900 2 312 12001005.0 -1 0.624276 477122.0 -4.057794 -132 0 5 22878.0 4.670211e+06 0.000000 4.670211e+06 1.179716e+06 1.179716e+06 0.72 2010-11-01 22878.0 2017-08-05 8.0 3.933395e+06 500000.0 22878.0 3.744831e+06 3.933395e+06 50.89 0.000000 2 3 11.549106 180 2.994184e+06 2.990124e+06 9.392115e+05 5947.630536 0.0 5947.630536 HL 10901101.0 81 1 25.961538 (50.0, 60.0] (0.6, 0.8] 15.814992 181918.666667
3 2010-10-06 0.0 0 THANE 91 11.236968 13.734072 11.236968 4 91 12001007.0 0 2.497104 1434259.0 -0.624276 89 0 1 65741.0 1.313098e+07 63659.280762 1.306732e+07 6.770848e+06 6.770848e+06 0.60 2010-10-06 65741.0 2018-05-02 5.0 1.002259e+07 1500000.0 65741.0 5.747875e+06 1.002259e+07 84.63 63659.280762 3 8 10.612692 180 4.295120e+04 4.648994e+06 9.979637e+06 60322.180776 0.0 60322.180776 HL 10901100.0 91 1 100.000000 (80.0, 90.0] (0.4, 0.6] 12.069336 543827.333333
4 2010-10-26 0.0 215 MUMBAI 89 14.982624 15.606900 14.982624 2 304 12001009.0 0 0.624276 0.0 -4.057794 -124 0 5 54433.0 1.048923e+07 0.000000 1.048923e+07 1.202181e+05 1.202181e+05 0.83 2010-10-26 54433.0 2018-04-05 4.0 7.755937e+06 54433.0 54433.0 7.200653e+06 7.755937e+06 30.94 0.000000 0 3 10.924830 180 7.200653e+06 6.593778e+06 5.552847e+05 27732.787464 0.0 27732.787464 HL 10901101.0 90 1 29.276316 (30.0, 40.0] (0.8, 1.0] 15.190716 54433.000000

Removal of unwanted variables

In [110]:
df_1=pd.concat([df,pd.get_dummies(df['PRODUCT'])],axis=1)
#One hot encoding for Product Column
In [111]:
df_1.head(5)
Out[111]:
AUTHORIZATIONDATE BALANCE_EXCESS BALANCE_TENURE CITY COMPLETED_TENURE CURRENT_INTEREST_RATE CURRENT_INTEREST_RATE_MAX CURRENT_INTEREST_RATE_MIN CURRENT_INTEREST_RATE_CHANGES CURRENT_TENOR CUSTOMERID DIFF_AUTH_INT_DATE DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_EMI_AMOUNT_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DIFF_ORIGINAL_CURRENT_TENOR DPD DUEDAY EMI_AMOUNT EMI_DUEAMT EMI_OS_AMOUNT EMI_RECEIVED_AMT EXCESS_ADJUSTED_AMT EXCESS_AVAILABLE FOIR INTEREST_START_DATE LAST_RECEIPT_AMOUNT LAST_RECEIPT_DATE LATEST_TRANSACTION_MONTH LOAN_AMT MAX_EMI_AMOUNT MIN_EMI_AMOUNT MONTHOPENING NET_DISBURSED_AMT NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_INTEREST_RATE ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PAID_INTEREST PAID_PRINCIPAL PRE_EMI_DUEAMT PRE_EMI_OS_AMOUNT PRE_EMI_RECEIVED_AMT PRODUCT SCHEMEID MOB FORECLOSURE Percentage_Completion NET_LTV_RANGE FOIR_Range Mean_Interest_Rate MEAN_EMI_AMOUNT HL LAP STHL STLAP
0 2010-08-29 0.0 0 MUMBAI 45 13.421934 13.734072 13.421934 1 45 12001000.0 1 0.312138 916441.0 -2.809242 75 0 1 83559.0 8.614898e+06 175477.782641 8.439420e+06 2.135869e+05 2.135869e+05 0.60 2010-08-30 1000000.0 2014-05-05 5.0 1.171011e+07 1000000.0 83559.0 8.693575e+06 1.171011e+07 40.06 175477.782641 1 1 10.612692 120 1.164472e+05 2.725723e+06 1.159366e+07 7994.273589 0.0 7994.273589 HL 10901100.0 45 1 100.000000 (40.0, 50.0] (0.4, 0.6] 13.525980 389039.333333 1 0 0 0
1 2010-09-15 0.0 99 MUMBAI 38 13.734072 13.734072 13.734072 0 137 12001001.0 0 0.000000 0.0 -3.121380 43 0 1 126530.0 1.061903e+07 279448.084560 1.033958e+07 0.000000e+00 0.000000e+00 0.60 2010-09-15 126530.0 2013-11-01 11.0 1.929025e+07 126530.0 126530.0 1.713861e+07 1.929025e+07 84.31 279448.084560 0 0 10.612692 180 1.234760e+07 4.945154e+06 6.942656e+06 100504.575864 0.0 100504.575864 HL 10901100.0 38 1 27.737226 (80.0, 90.0] (0.4, 0.6] 13.734072 126530.000000 1 0 0 0
2 2010-11-02 0.0 231 MUMBAI 81 15.606900 16.231176 15.606900 2 312 12001005.0 -1 0.624276 477122.0 -4.057794 -132 0 5 22878.0 4.670211e+06 0.000000 4.670211e+06 1.179716e+06 1.179716e+06 0.72 2010-11-01 22878.0 2017-08-05 8.0 3.933395e+06 500000.0 22878.0 3.744831e+06 3.933395e+06 50.89 0.000000 2 3 11.549106 180 2.994184e+06 2.990124e+06 9.392115e+05 5947.630536 0.0 5947.630536 HL 10901101.0 81 1 25.961538 (50.0, 60.0] (0.6, 0.8] 15.814992 181918.666667 1 0 0 0
3 2010-10-06 0.0 0 THANE 91 11.236968 13.734072 11.236968 4 91 12001007.0 0 2.497104 1434259.0 -0.624276 89 0 1 65741.0 1.313098e+07 63659.280762 1.306732e+07 6.770848e+06 6.770848e+06 0.60 2010-10-06 65741.0 2018-05-02 5.0 1.002259e+07 1500000.0 65741.0 5.747875e+06 1.002259e+07 84.63 63659.280762 3 8 10.612692 180 4.295120e+04 4.648994e+06 9.979637e+06 60322.180776 0.0 60322.180776 HL 10901100.0 91 1 100.000000 (80.0, 90.0] (0.4, 0.6] 12.069336 543827.333333 1 0 0 0
4 2010-10-26 0.0 215 MUMBAI 89 14.982624 15.606900 14.982624 2 304 12001009.0 0 0.624276 0.0 -4.057794 -124 0 5 54433.0 1.048923e+07 0.000000 1.048923e+07 1.202181e+05 1.202181e+05 0.83 2010-10-26 54433.0 2018-04-05 4.0 7.755937e+06 54433.0 54433.0 7.200653e+06 7.755937e+06 30.94 0.000000 0 3 10.924830 180 7.200653e+06 6.593778e+06 5.552847e+05 27732.787464 0.0 27732.787464 HL 10901101.0 90 1 29.276316 (30.0, 40.0] (0.8, 1.0] 15.190716 54433.000000 1 0 0 0
In [112]:
df_1.shape
Out[112]:
(19490, 59)
In [113]:
df_1.drop(['PRODUCT'],axis=1,inplace=True)
# We can drop this PRODUCT column as it has already been taken care in form of One hot encoding
In [114]:
df[['BALANCE_EXCESS', 'BALANCE_TENURE','COMPLETED_TENURE', 'CURRENT_INTEREST_RATE', \
'CURRENT_INTEREST_RATE_MAX', 'CURRENT_INTEREST_RATE_MIN','CURRENT_INTEREST_RATE_CHANGES', 'CURRENT_TENOR',\
'DIFF_AUTH_INT_DATE','DIFF_CURRENT_INTEREST_RATE_MAX_MIN', 'DIFF_EMI_AMOUNT_MAX_MIN','DIFF_ORIGINAL_CURRENT_INTEREST_RATE',\
'DIFF_ORIGINAL_CURRENT_TENOR','DPD', 'DUEDAY', 'EMI_AMOUNT', 'EMI_DUEAMT', 'EMI_OS_AMOUNT','EMI_RECEIVED_AMT',\
'EXCESS_ADJUSTED_AMT', 'EXCESS_AVAILABLE', 'FOIR','LAST_RECEIPT_AMOUNT', \
'LATEST_TRANSACTION_MONTH', 'LOAN_AMT', 'MAX_EMI_AMOUNT','MIN_EMI_AMOUNT', 'MONTHOPENING', 'NET_DISBURSED_AMT', 'NET_LTV',
'NET_RECEIVABLE', 'NUM_EMI_CHANGES', 'NUM_LOW_FREQ_TRANSACTIONS','ORIGNAL_INTEREST_RATE', 'ORIGNAL_TENOR',\
'OUTSTANDING_PRINCIPAL','PAID_INTEREST', 'PAID_PRINCIPAL', 'PRE_EMI_DUEAMT','PRE_EMI_OS_AMOUNT', 'PRE_EMI_RECEIVED_AMT','Percentage_Completion',\
 'Mean_Interest_Rate','MEAN_EMI_AMOUNT' ]].corr()
Out[114]:
BALANCE_EXCESS BALANCE_TENURE COMPLETED_TENURE CURRENT_INTEREST_RATE CURRENT_INTEREST_RATE_MAX CURRENT_INTEREST_RATE_MIN CURRENT_INTEREST_RATE_CHANGES CURRENT_TENOR DIFF_AUTH_INT_DATE DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_EMI_AMOUNT_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DIFF_ORIGINAL_CURRENT_TENOR DPD DUEDAY EMI_AMOUNT EMI_DUEAMT EMI_OS_AMOUNT EMI_RECEIVED_AMT EXCESS_ADJUSTED_AMT EXCESS_AVAILABLE FOIR LAST_RECEIPT_AMOUNT LATEST_TRANSACTION_MONTH LOAN_AMT MAX_EMI_AMOUNT MIN_EMI_AMOUNT MONTHOPENING NET_DISBURSED_AMT NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_INTEREST_RATE ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PAID_INTEREST PAID_PRINCIPAL PRE_EMI_DUEAMT PRE_EMI_OS_AMOUNT PRE_EMI_RECEIVED_AMT Percentage_Completion Mean_Interest_Rate MEAN_EMI_AMOUNT
BALANCE_EXCESS 1.000000 -0.012266 0.017507 -0.008641 -0.002871 -0.006122 0.004445 -0.008474 -0.000674 0.009577 0.553876 0.017888 0.001877 0.113120 0.021477 0.099806 0.072246 0.253958 0.049247 0.019297 0.377228 0.011549 0.538527 -0.075756 0.110652 0.555380 0.075860 0.116273 0.111331 0.003119 -0.888265 0.035252 0.032937 -0.002174 -0.009720 0.109934 0.057999 0.002517 0.029929 -0.000058 0.029988 0.011889 -0.005976 0.547501
BALANCE_TENURE -0.012266 1.000000 -0.443806 -0.390425 -0.433979 -0.399386 0.008628 0.967658 -0.018502 -0.007369 -0.075292 -0.236978 -0.641320 -0.056924 -0.202126 -0.128066 -0.212102 -0.022054 -0.212317 -0.138073 -0.132304 -0.018946 -0.056824 0.165859 -0.058728 -0.077676 -0.058840 -0.022549 -0.059304 0.275013 0.001976 -0.199398 -0.264936 -0.452958 0.790460 0.005364 -0.172413 -0.038016 -0.042933 0.008613 -0.043272 -0.662382 -0.413922 -0.091405
COMPLETED_TENURE 0.017507 -0.443806 1.000000 0.246368 0.330673 0.204003 0.363390 -0.203393 0.044587 0.283073 0.040077 0.115659 0.168535 0.145125 0.348767 0.072011 0.328069 0.085400 0.323600 0.079383 0.079823 0.023035 0.025073 -0.117065 0.025710 0.043443 0.078040 -0.004685 0.029593 -0.135041 0.022721 0.304534 0.527518 0.274329 -0.140901 -0.023899 0.345878 0.011306 0.013785 -0.019255 0.014383 0.805790 0.262839 0.053110
CURRENT_INTEREST_RATE -0.008641 -0.390425 0.246368 1.000000 0.975300 0.959370 -0.210919 -0.357221 0.013890 -0.154413 -0.008081 -0.039545 0.066288 0.089529 0.218021 0.001666 0.074271 0.032717 0.072007 0.002098 -0.001162 -0.045499 -0.004939 -0.095418 -0.057505 -0.006290 0.039259 -0.069717 -0.055587 -0.326491 0.024258 0.191205 0.207415 0.940737 -0.419373 -0.072402 0.072696 0.000998 0.000201 0.001177 0.000167 0.263601 0.992676 -0.004160
CURRENT_INTEREST_RATE_MAX -0.002871 -0.433979 0.330673 0.975300 1.000000 0.932378 -0.126192 -0.381076 0.016913 -0.016459 0.007161 0.065267 0.104435 0.093790 0.236355 0.021130 0.118762 0.036289 0.116641 0.019932 0.017431 -0.033860 0.009365 -0.116411 -0.039371 0.009211 0.046202 -0.055576 -0.037187 -0.335824 0.020146 0.202475 0.242590 0.952743 -0.422143 -0.061238 0.120457 0.005794 0.015597 0.002782 0.015545 0.338222 0.983061 0.013194
CURRENT_INTEREST_RATE_MIN -0.006122 -0.399386 0.204003 0.959370 0.932378 1.000000 -0.412287 -0.378938 0.014616 -0.376782 -0.011447 0.137838 0.177686 0.090916 0.230237 -0.050750 0.028926 0.028462 0.026572 -0.005540 -0.007332 -0.078329 -0.008337 -0.162402 -0.106966 -0.009669 0.038705 -0.118305 -0.105759 -0.283690 0.019698 0.207068 0.186323 0.962174 -0.364518 -0.120549 0.027791 -0.000193 -0.007600 0.001290 -0.007653 0.237085 0.979571 -0.013284
CURRENT_INTEREST_RATE_CHANGES 0.004445 0.008628 0.363390 -0.210919 -0.126192 -0.412287 1.000000 0.111724 0.007295 0.817016 0.042089 -0.297098 -0.213100 0.000300 -0.051476 0.165891 0.213275 0.024247 0.213298 0.056978 0.054376 0.140091 0.034086 0.153984 0.175723 0.042345 0.008910 0.167582 0.177899 -0.060001 0.007055 -0.049301 0.138554 -0.302087 -0.012798 0.160117 0.227374 0.014047 0.055568 0.003552 0.055570 0.249569 -0.257841 0.059568
CURRENT_TENOR -0.008474 0.967658 -0.203393 -0.357221 -0.381076 -0.378938 0.111724 1.000000 -0.007663 0.071636 -0.070982 -0.226361 -0.653258 -0.021341 -0.122661 -0.119652 -0.139387 -0.000055 -0.140880 -0.128511 -0.122084 -0.014216 -0.055027 0.148261 -0.056928 -0.072639 -0.042319 -0.025956 -0.056465 0.262462 0.008555 -0.132132 -0.140967 -0.417673 0.823987 -0.000867 -0.091011 -0.038353 -0.043028 0.003990 -0.043230 -0.496879 -0.378258 -0.084917
DIFF_AUTH_INT_DATE -0.000674 -0.018502 0.044587 0.013890 0.016913 0.014616 0.007295 -0.007663 1.000000 0.002912 -0.000494 -0.058302 0.006387 0.001072 0.040863 0.010304 0.028751 0.001242 0.028944 0.003066 0.002598 -0.004453 0.000694 -0.030076 0.008335 0.000684 0.026202 0.006282 0.008422 0.003465 0.001267 -0.008435 0.004335 -0.006534 -0.005281 -0.000331 0.032832 0.003118 0.001711 -0.000261 0.001722 0.062587 0.015357 0.002938
DIFF_CURRENT_INTEREST_RATE_MAX_MIN 0.009577 -0.007369 0.283073 -0.154413 -0.016459 -0.376782 0.817016 0.071636 0.002912 1.000000 0.050012 -0.214013 -0.223864 -0.011136 -0.031178 0.194518 0.224316 0.014264 0.225392 0.066400 0.064948 0.129891 0.047058 0.150899 0.194978 0.050349 0.011336 0.184818 0.197237 -0.075860 -0.002861 -0.053910 0.106265 -0.219971 -0.073483 0.176516 0.231802 0.015381 0.060989 0.003560 0.061001 0.210913 -0.190403 0.070554
DIFF_EMI_AMOUNT_MAX_MIN 0.553876 -0.075292 0.040077 -0.008081 0.007161 -0.011447 0.042089 -0.070982 -0.000494 0.050012 1.000000 0.007818 0.091998 0.042842 0.024342 0.225157 0.198707 0.090856 0.192337 0.569582 0.726637 0.024970 0.837974 -0.074653 0.263836 0.998994 0.054064 0.226453 0.265367 0.011182 -0.515636 0.063107 0.057079 -0.005057 -0.024342 0.145207 0.222011 0.279341 0.267106 -0.000563 0.267639 0.108303 -0.004379 0.986370
DIFF_ORIGINAL_CURRENT_INTEREST_RATE 0.017888 -0.236978 0.115659 -0.039545 0.065267 0.137838 -0.297098 -0.226361 -0.058302 -0.214013 0.007818 1.000000 0.491618 0.001847 0.059911 -0.113521 -0.028071 -0.023844 -0.026141 0.034962 0.038814 -0.082887 0.006964 -0.158614 -0.128522 0.005243 -0.056745 -0.142314 -0.129623 0.029653 -0.029378 0.163893 0.076130 0.301671 0.070727 -0.149978 -0.035430 0.010412 -0.002699 -0.001634 -0.002656 0.145133 0.057475 -0.010100
DIFF_ORIGINAL_CURRENT_TENOR 0.001877 -0.641320 0.168535 0.066288 0.104435 0.177686 -0.213100 -0.653258 0.006387 -0.223864 0.091998 0.491618 1.000000 0.001844 0.080935 -0.023182 0.037404 -0.014533 0.039166 0.188476 0.175259 -0.050280 0.053836 -0.165122 -0.041300 0.091835 0.003407 -0.062817 -0.042507 -0.002953 -0.008775 0.137586 0.116000 0.230106 -0.109276 -0.101522 0.019141 0.048566 0.035751 0.002217 0.035754 0.407600 0.119490 0.085703
DPD 0.113120 -0.056924 0.145125 0.089529 0.093790 0.090916 0.000300 -0.021341 0.001072 -0.011136 0.042842 0.001847 0.001844 1.000000 0.118678 0.014278 0.071100 0.562668 0.019187 0.003710 0.044086 -0.000460 0.041255 -0.076200 0.015202 0.043171 0.010601 0.016705 0.015587 0.018775 0.153881 0.160247 0.301306 0.086051 -0.026637 0.014745 0.028722 -0.000757 0.020967 0.108695 0.017781 0.097332 0.092793 0.043490
DUEDAY 0.021477 -0.202126 0.348767 0.218021 0.236355 0.230237 -0.051476 -0.122661 0.040863 -0.031178 0.024342 0.059911 0.080935 0.118678 1.000000 0.061206 0.131792 0.035603 0.129876 0.020203 0.026432 -0.014030 0.034007 -0.160197 0.045432 0.028047 0.084387 0.033111 0.046006 -0.075109 -0.004627 0.184941 0.179266 0.228360 -0.100468 0.027085 0.134354 0.005843 0.061668 0.016752 0.061290 0.308196 0.231735 0.037398
EMI_AMOUNT 0.099806 -0.128066 0.072011 0.001666 0.021130 -0.050750 0.165891 -0.119652 0.010304 0.194518 0.225157 -0.113521 -0.023182 0.014278 0.061206 1.000000 0.732050 0.074366 0.732957 0.227006 0.246140 0.127163 0.218160 -0.084910 0.920666 0.243135 0.417621 0.893890 0.926277 0.016088 -0.065422 -0.017189 0.043859 -0.036940 -0.174435 0.874496 0.710589 0.090676 0.338529 -0.004782 0.339325 0.132799 -0.010537 0.363125
EMI_DUEAMT 0.072246 -0.212102 0.328069 0.074271 0.118762 0.028926 0.213275 -0.139387 0.028751 0.224316 0.198707 -0.028071 0.037404 0.071100 0.131792 0.732050 1.000000 0.160516 0.995721 0.242289 0.250393 0.099286 0.196213 -0.101498 0.657813 0.217831 0.441112 0.573769 0.662515 0.002967 0.003275 0.049089 0.177813 0.061339 -0.155004 0.535010 0.956779 0.065008 0.261871 -0.004979 0.262525 0.353967 0.073937 0.309905
EMI_OS_AMOUNT 0.253958 -0.022054 0.085400 0.032717 0.036289 0.028462 0.024247 -0.000055 0.001242 0.014264 0.090856 -0.023844 -0.014533 0.562668 0.035603 0.074366 0.160516 1.000000 0.068621 0.018115 0.108041 0.013318 0.100746 -0.029801 0.088152 0.093194 0.059013 0.091590 0.088803 0.015638 0.218619 0.062155 0.139388 0.023124 -0.010948 0.086833 0.087991 0.001883 0.058410 0.012551 0.058150 0.055091 0.032880 0.100306
EMI_RECEIVED_AMT 0.049247 -0.212317 0.323600 0.072007 0.116641 0.026572 0.213298 -0.140880 0.028944 0.225392 0.192337 -0.026141 0.039166 0.019187 0.129876 0.732957 0.995721 0.068621 1.000000 0.243198 0.242969 0.099107 0.188890 -0.099799 0.656631 0.211448 0.440329 0.571362 0.661322 0.001535 -0.017157 0.043798 0.166675 0.059834 -0.155645 0.532632 0.958826 0.065531 0.259218 -0.006208 0.259903 0.352614 0.071653 0.303845
EXCESS_ADJUSTED_AMT 0.019297 -0.138073 0.079383 0.002098 0.019932 -0.005540 0.056978 -0.128511 0.003066 0.066400 0.569582 0.034962 0.188476 0.003710 0.020203 0.227006 0.242289 0.018115 0.243198 1.000000 0.933227 0.024894 0.359752 -0.054875 0.279286 0.570871 0.072256 0.181253 0.280881 0.007330 -0.010885 0.075802 0.082905 0.013868 -0.027669 0.050948 0.263203 0.635590 0.454579 -0.002171 0.455522 0.204762 0.005290 0.576429
EXCESS_AVAILABLE 0.377228 -0.132304 0.079823 -0.001162 0.017431 -0.007332 0.054376 -0.122084 0.002598 0.064948 0.726637 0.038814 0.175259 0.044086 0.026432 0.246140 0.250393 0.108041 0.242969 0.933227 1.000000 0.027209 0.526757 -0.078054 0.298464 0.728372 0.094191 0.209677 0.300186 0.007911 -0.329284 0.082883 0.088630 0.012065 -0.029122 0.086698 0.264645 0.589647 0.431828 -0.002031 0.432723 0.193942 0.002752 0.730689
FOIR 0.011549 -0.018946 0.023035 -0.045499 -0.033860 -0.078329 0.140091 -0.014216 -0.004453 0.129891 0.024970 -0.082887 -0.050280 -0.000460 -0.014030 0.127163 0.099286 0.013318 0.099107 0.024894 0.027209 1.000000 0.024753 -0.067666 0.140873 0.027396 0.055922 0.141859 0.140419 -0.034783 -0.005318 -0.049002 -0.006215 -0.071545 -0.056291 0.139771 0.107431 0.001664 0.042572 0.001271 0.042617 0.033302 -0.054035 0.042917
LAST_RECEIPT_AMOUNT 0.538527 -0.056824 0.025073 -0.004939 0.009365 -0.008337 0.034086 -0.055027 0.000694 0.047058 0.837974 0.006964 0.053836 0.041255 0.034007 0.218160 0.196213 0.100746 0.188890 0.359752 0.526757 0.024753 1.000000 -0.066494 0.219394 0.838276 0.070794 0.218183 0.220587 0.006530 -0.495465 0.025457 0.025656 -0.002349 -0.031954 0.150563 0.211861 0.032483 0.115581 -0.001579 0.115851 0.078607 -0.001509 0.832044
LATEST_TRANSACTION_MONTH -0.075756 0.165859 -0.117065 -0.095418 -0.116411 -0.162402 0.153984 0.148261 -0.030076 0.150899 -0.074653 -0.158614 -0.165122 -0.076200 -0.160197 -0.084910 -0.101498 -0.029801 -0.099799 -0.054875 -0.078054 -0.067666 -0.066494 1.000000 -0.095509 -0.079647 -0.116927 -0.090188 -0.093653 0.081248 0.062115 0.035253 -0.017181 -0.144878 0.071076 -0.079204 -0.108425 -0.005172 -0.046733 -0.019693 -0.046239 -0.154275 -0.127615 -0.090976
LOAN_AMT 0.110652 -0.058728 0.025710 -0.057505 -0.039371 -0.106966 0.175723 -0.056928 0.008335 0.194978 0.263836 -0.128522 -0.041300 0.015202 0.045432 0.920666 0.657813 0.088152 0.656631 0.279286 0.298464 0.140873 0.219394 -0.095509 1.000000 0.280306 0.386985 0.979505 0.998060 0.044555 -0.069607 -0.035345 0.017333 -0.098489 -0.105646 0.960872 0.683321 0.175500 0.418432 0.021183 0.418612 0.059609 -0.069999 0.388616
MAX_EMI_AMOUNT 0.555380 -0.077676 0.043443 -0.006290 0.009211 -0.009669 0.042345 -0.072639 0.000684 0.050349 0.998994 0.005243 0.091835 0.043171 0.028047 0.243135 0.217831 0.093194 0.211448 0.570871 0.728372 0.027396 0.838276 -0.079647 0.280306 1.000000 0.098779 0.241698 0.281912 0.011300 -0.516045 0.057783 0.055313 -0.004223 -0.026639 0.160401 0.239843 0.279584 0.270111 -0.000712 0.270654 0.112637 -0.002481 0.991271
MIN_EMI_AMOUNT 0.075860 -0.058840 0.078040 0.039259 0.046202 0.038705 0.008910 -0.042319 0.026202 0.011336 0.054064 -0.056745 0.003407 0.010601 0.084387 0.417621 0.441112 0.059013 0.440329 0.072256 0.094191 0.055922 0.070794 -0.116927 0.386985 0.098779 1.000000 0.356845 0.388768 0.003476 -0.048543 -0.113747 -0.034966 0.018200 -0.053009 0.349490 0.414127 0.026761 0.087359 -0.003350 0.087628 0.104805 0.041949 0.184581
MONTHOPENING 0.116273 -0.022549 -0.004685 -0.069717 -0.055576 -0.118305 0.167582 -0.025956 0.006282 0.184818 0.226453 -0.142314 -0.062817 0.016705 0.033111 0.893890 0.573769 0.091590 0.571362 0.181253 0.209677 0.141859 0.218183 -0.090188 0.979505 0.241698 0.356845 1.000000 0.980953 0.049561 -0.073628 -0.046332 0.001317 -0.114823 -0.081086 0.985926 0.617770 0.032881 0.327730 0.023388 0.327669 0.010012 -0.083402 0.347267
NET_DISBURSED_AMT 0.111331 -0.059304 0.029593 -0.055587 -0.037187 -0.105759 0.177899 -0.056465 0.008422 0.197237 0.265367 -0.129623 -0.042507 0.015587 0.046006 0.926277 0.662515 0.088803 0.661322 0.280881 0.300186 0.140419 0.220587 -0.093653 0.998060 0.281912 0.388768 0.980953 1.000000 0.043753 -0.069988 -0.034585 0.019244 -0.097033 -0.105941 0.962052 0.687888 0.176389 0.416473 0.020684 0.416664 0.062794 -0.068217 0.390861
NET_LTV 0.003119 0.275013 -0.135041 -0.326491 -0.335824 -0.283690 -0.060001 0.262462 0.003465 -0.075860 0.011182 0.029653 -0.002953 0.018775 -0.075109 0.016088 0.002967 0.015638 0.001535 0.007330 0.007911 -0.034783 0.006530 0.081248 0.044555 0.011300 0.003476 0.049561 0.043753 1.000000 0.004360 -0.025121 -0.044510 -0.301460 0.342366 0.049295 0.011215 0.001098 0.011098 0.010126 0.010819 -0.165085 -0.319327 0.012794
NET_RECEIVABLE -0.888265 0.001976 0.022721 0.024258 0.020146 0.019698 0.007055 0.008555 0.001267 -0.002861 -0.515636 -0.029378 -0.008775 0.153881 -0.004627 -0.065422 0.003275 0.218619 -0.017157 -0.010885 -0.329284 -0.005318 -0.495465 0.062115 -0.069607 -0.516045 -0.048543 -0.073628 -0.069988 0.004360 1.000000 -0.005981 0.033205 0.013175 0.004665 -0.069483 -0.016759 -0.001650 -0.001809 0.014224 -0.002235 0.014031 0.021652 -0.504726
NUM_EMI_CHANGES 0.035252 -0.199398 0.304534 0.191205 0.202475 0.207068 -0.049301 -0.132132 -0.008435 -0.053910 0.063107 0.163893 0.137586 0.160247 0.184941 -0.017189 0.049089 0.062155 0.043798 0.075802 0.082883 -0.049002 0.025457 0.035253 -0.035345 0.057783 -0.113747 -0.046332 -0.034585 -0.025121 -0.005981 1.000000 0.709111 0.238066 -0.070507 -0.058476 0.061807 0.034886 0.106896 0.009181 0.106830 0.246673 0.203479 0.048639
NUM_LOW_FREQ_TRANSACTIONS 0.032937 -0.264936 0.527518 0.207415 0.242590 0.186323 0.138554 -0.140967 0.004335 0.106265 0.057079 0.076130 0.116000 0.301306 0.179266 0.043859 0.177813 0.139388 0.166675 0.082905 0.088630 -0.006215 0.025656 -0.017181 0.017333 0.055313 -0.034966 0.001317 0.019244 -0.044510 0.033205 0.709111 1.000000 0.223746 -0.098260 -0.014082 0.187696 0.020062 0.060041 0.031954 0.059208 0.434577 0.214670 0.056487
ORIGNAL_INTEREST_RATE -0.002174 -0.452958 0.274329 0.940737 0.952743 0.962174 -0.302087 -0.417673 -0.006534 -0.219971 -0.005057 0.301671 0.230106 0.086051 0.228360 -0.036940 0.061339 0.023124 0.059834 0.013868 0.012065 -0.071545 -0.002349 -0.144878 -0.098489 -0.004223 0.018200 -0.114823 -0.097033 -0.301460 0.013175 0.238066 0.223746 1.000000 -0.376143 -0.119986 0.057338 0.004486 -0.000724 0.000568 -0.000742 0.300776 0.966677 -0.007397
ORIGNAL_TENOR -0.009720 0.790460 -0.140901 -0.419373 -0.422143 -0.364518 -0.012798 0.823987 -0.005281 -0.073483 -0.024342 0.070727 -0.109276 -0.026637 -0.100468 -0.174435 -0.155004 -0.010948 -0.155645 -0.027669 -0.029122 -0.056291 -0.031954 0.071076 -0.105646 -0.026639 -0.053009 -0.081086 -0.105941 0.342366 0.004665 -0.070507 -0.098260 -0.376143 1.000000 -0.077113 -0.105160 -0.014008 -0.029735 0.006898 -0.029998 -0.347300 -0.407178 -0.047347
OUTSTANDING_PRINCIPAL 0.109934 0.005364 -0.023899 -0.072402 -0.061238 -0.120549 0.160117 -0.000867 -0.000331 0.176516 0.145207 -0.149978 -0.101522 0.014745 0.027085 0.874496 0.535010 0.086833 0.532632 0.050948 0.086698 0.139771 0.150563 -0.079204 0.960872 0.160401 0.349490 0.985926 0.962052 0.049295 -0.069483 -0.058476 -0.014082 -0.119986 -0.077113 1.000000 0.583613 0.021515 0.317126 0.024343 0.317017 -0.036938 -0.086953 0.266750
PAID_INTEREST 0.057999 -0.172413 0.345878 0.072696 0.120457 0.027791 0.227374 -0.091011 0.032832 0.231802 0.222011 -0.035430 0.019141 0.028722 0.134354 0.710589 0.956779 0.087991 0.958826 0.263203 0.264645 0.107431 0.211861 -0.108425 0.683321 0.239843 0.414127 0.617770 0.687888 0.011215 -0.016759 0.061807 0.187696 0.057338 -0.105160 0.583613 1.000000 0.106170 0.345332 -0.002365 0.346070 0.308841 0.073572 0.327471
PAID_PRINCIPAL 0.002517 -0.038016 0.011306 0.000998 0.005794 -0.000193 0.014047 -0.038353 0.003118 0.015381 0.279341 0.010412 0.048566 -0.000757 0.005843 0.090676 0.065008 0.001883 0.065531 0.635590 0.589647 0.001664 0.032483 -0.005172 0.175500 0.279584 0.026761 0.032881 0.176389 0.001098 -0.001650 0.034886 0.020062 0.004486 -0.014008 0.021515 0.106170 1.000000 0.624601 -0.000560 0.625824 0.029548 0.002170 0.279643
PRE_EMI_DUEAMT 0.029929 -0.042933 0.013785 0.000201 0.015597 -0.007600 0.055568 -0.043028 0.001711 0.060989 0.267106 -0.002699 0.035751 0.020967 0.061668 0.338529 0.261871 0.058410 0.259218 0.454579 0.431828 0.042572 0.115581 -0.046733 0.418432 0.270111 0.087359 0.327730 0.416473 0.011098 -0.001809 0.106896 0.060041 -0.000724 -0.029735 0.317126 0.345332 0.624601 1.000000 0.079875 0.999562 0.025857 0.002503 0.300861
PRE_EMI_OS_AMOUNT -0.000058 0.008613 -0.019255 0.001177 0.002782 0.001290 0.003552 0.003990 -0.000261 0.003560 -0.000563 -0.001634 0.002217 0.108695 0.016752 -0.004782 -0.004979 0.012551 -0.006208 -0.002171 -0.002031 0.001271 -0.001579 -0.019693 0.021183 -0.000712 -0.003350 0.023388 0.020684 0.010126 0.014224 0.009181 0.031954 0.000568 0.006898 0.024343 -0.002365 -0.000560 0.079875 1.000000 0.050353 -0.015595 0.001764 -0.001362
PRE_EMI_RECEIVED_AMT 0.029988 -0.043272 0.014383 0.000167 0.015545 -0.007653 0.055570 -0.043230 0.001722 0.061001 0.267639 -0.002656 0.035754 0.017781 0.061290 0.339325 0.262525 0.058150 0.259903 0.455522 0.432723 0.042617 0.115851 -0.046239 0.418612 0.270654 0.087628 0.327669 0.416664 0.010819 -0.002235 0.106830 0.059208 -0.000742 -0.029998 0.317017 0.346070 0.625824 0.999562 0.050353 1.000000 0.026369 0.002455 0.301483
Percentage_Completion 0.011889 -0.662382 0.805790 0.263601 0.338222 0.237085 0.249569 -0.496879 0.062587 0.210913 0.108303 0.145133 0.407600 0.097332 0.308196 0.132799 0.353967 0.055091 0.352614 0.204762 0.193942 0.033302 0.078607 -0.154275 0.059609 0.112637 0.104805 0.010012 0.062794 -0.165085 0.014031 0.246673 0.434577 0.300776 -0.347300 -0.036938 0.308841 0.029548 0.025857 -0.015595 0.026369 1.000000 0.282781 0.127472
Mean_Interest_Rate -0.005976 -0.413922 0.262839 0.992676 0.983061 0.979571 -0.257841 -0.378258 0.015357 -0.190403 -0.004379 0.057475 0.119490 0.092793 0.231735 -0.010537 0.073937 0.032880 0.071653 0.005290 0.002752 -0.054035 -0.001509 -0.127615 -0.069999 -0.002481 0.041949 -0.083402 -0.068217 -0.319327 0.021652 0.203479 0.214670 0.966677 -0.407178 -0.086953 0.073572 0.002170 0.002503 0.001764 0.002455 0.282781 1.000000 -0.001751
MEAN_EMI_AMOUNT 0.547501 -0.091405 0.053110 -0.004160 0.013194 -0.013284 0.059568 -0.084917 0.002938 0.070554 0.986370 -0.010100 0.085703 0.043490 0.037398 0.363125 0.309905 0.100306 0.303845 0.576429 0.730689 0.042917 0.832044 -0.090976 0.388616 0.991271 0.184581 0.347267 0.390861 0.012794 -0.504726 0.048639 0.056487 -0.007397 -0.047347 0.266750 0.327471 0.279643 0.300861 -0.001362 0.301483 0.127472 -0.001751 1.000000
In [115]:
plt.figure(figsize=(20,20))
sns.heatmap(data=df[['BALANCE_EXCESS', 'BALANCE_TENURE','COMPLETED_TENURE', 'CURRENT_INTEREST_RATE', \
'CURRENT_INTEREST_RATE_MAX', 'CURRENT_INTEREST_RATE_MIN','CURRENT_INTEREST_RATE_CHANGES', 'CURRENT_TENOR',\
'DIFF_AUTH_INT_DATE','DIFF_CURRENT_INTEREST_RATE_MAX_MIN', 'DIFF_EMI_AMOUNT_MAX_MIN','DIFF_ORIGINAL_CURRENT_INTEREST_RATE',\
'DIFF_ORIGINAL_CURRENT_TENOR','DPD', 'DUEDAY', 'EMI_AMOUNT', 'EMI_DUEAMT', 'EMI_OS_AMOUNT','EMI_RECEIVED_AMT',\
'EXCESS_ADJUSTED_AMT', 'EXCESS_AVAILABLE', 'FOIR','LAST_RECEIPT_AMOUNT', \
'LATEST_TRANSACTION_MONTH', 'LOAN_AMT', 'MAX_EMI_AMOUNT','MIN_EMI_AMOUNT', 'MONTHOPENING', 'NET_DISBURSED_AMT', 'NET_LTV',
'NET_RECEIVABLE', 'NUM_EMI_CHANGES', 'NUM_LOW_FREQ_TRANSACTIONS','ORIGNAL_INTEREST_RATE', 'ORIGNAL_TENOR',\
'OUTSTANDING_PRINCIPAL','PAID_INTEREST', 'PAID_PRINCIPAL', 'PRE_EMI_DUEAMT','PRE_EMI_OS_AMOUNT', 'PRE_EMI_RECEIVED_AMT','Percentage_Completion',\
 'Mean_Interest_Rate','MEAN_EMI_AMOUNT']].corr(),annot=True,fmt='0.1g',linewidths=2, linecolor='black',cbar_kws= {'orientation': 'horizontal'})

# Multicollinearity is present in lot of variables . Hence we will have to intelligently drop some of the highly correlated
# variables.
Out[115]:
<matplotlib.axes._subplots.AxesSubplot at 0x7d91416b38>
In [116]:
df_1.head()
Out[116]:
AUTHORIZATIONDATE BALANCE_EXCESS BALANCE_TENURE CITY COMPLETED_TENURE CURRENT_INTEREST_RATE CURRENT_INTEREST_RATE_MAX CURRENT_INTEREST_RATE_MIN CURRENT_INTEREST_RATE_CHANGES CURRENT_TENOR CUSTOMERID DIFF_AUTH_INT_DATE DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_EMI_AMOUNT_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DIFF_ORIGINAL_CURRENT_TENOR DPD DUEDAY EMI_AMOUNT EMI_DUEAMT EMI_OS_AMOUNT EMI_RECEIVED_AMT EXCESS_ADJUSTED_AMT EXCESS_AVAILABLE FOIR INTEREST_START_DATE LAST_RECEIPT_AMOUNT LAST_RECEIPT_DATE LATEST_TRANSACTION_MONTH LOAN_AMT MAX_EMI_AMOUNT MIN_EMI_AMOUNT MONTHOPENING NET_DISBURSED_AMT NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_INTEREST_RATE ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PAID_INTEREST PAID_PRINCIPAL PRE_EMI_DUEAMT PRE_EMI_OS_AMOUNT PRE_EMI_RECEIVED_AMT SCHEMEID MOB FORECLOSURE Percentage_Completion NET_LTV_RANGE FOIR_Range Mean_Interest_Rate MEAN_EMI_AMOUNT HL LAP STHL STLAP
0 2010-08-29 0.0 0 MUMBAI 45 13.421934 13.734072 13.421934 1 45 12001000.0 1 0.312138 916441.0 -2.809242 75 0 1 83559.0 8.614898e+06 175477.782641 8.439420e+06 2.135869e+05 2.135869e+05 0.60 2010-08-30 1000000.0 2014-05-05 5.0 1.171011e+07 1000000.0 83559.0 8.693575e+06 1.171011e+07 40.06 175477.782641 1 1 10.612692 120 1.164472e+05 2.725723e+06 1.159366e+07 7994.273589 0.0 7994.273589 10901100.0 45 1 100.000000 (40.0, 50.0] (0.4, 0.6] 13.525980 389039.333333 1 0 0 0
1 2010-09-15 0.0 99 MUMBAI 38 13.734072 13.734072 13.734072 0 137 12001001.0 0 0.000000 0.0 -3.121380 43 0 1 126530.0 1.061903e+07 279448.084560 1.033958e+07 0.000000e+00 0.000000e+00 0.60 2010-09-15 126530.0 2013-11-01 11.0 1.929025e+07 126530.0 126530.0 1.713861e+07 1.929025e+07 84.31 279448.084560 0 0 10.612692 180 1.234760e+07 4.945154e+06 6.942656e+06 100504.575864 0.0 100504.575864 10901100.0 38 1 27.737226 (80.0, 90.0] (0.4, 0.6] 13.734072 126530.000000 1 0 0 0
2 2010-11-02 0.0 231 MUMBAI 81 15.606900 16.231176 15.606900 2 312 12001005.0 -1 0.624276 477122.0 -4.057794 -132 0 5 22878.0 4.670211e+06 0.000000 4.670211e+06 1.179716e+06 1.179716e+06 0.72 2010-11-01 22878.0 2017-08-05 8.0 3.933395e+06 500000.0 22878.0 3.744831e+06 3.933395e+06 50.89 0.000000 2 3 11.549106 180 2.994184e+06 2.990124e+06 9.392115e+05 5947.630536 0.0 5947.630536 10901101.0 81 1 25.961538 (50.0, 60.0] (0.6, 0.8] 15.814992 181918.666667 1 0 0 0
3 2010-10-06 0.0 0 THANE 91 11.236968 13.734072 11.236968 4 91 12001007.0 0 2.497104 1434259.0 -0.624276 89 0 1 65741.0 1.313098e+07 63659.280762 1.306732e+07 6.770848e+06 6.770848e+06 0.60 2010-10-06 65741.0 2018-05-02 5.0 1.002259e+07 1500000.0 65741.0 5.747875e+06 1.002259e+07 84.63 63659.280762 3 8 10.612692 180 4.295120e+04 4.648994e+06 9.979637e+06 60322.180776 0.0 60322.180776 10901100.0 91 1 100.000000 (80.0, 90.0] (0.4, 0.6] 12.069336 543827.333333 1 0 0 0
4 2010-10-26 0.0 215 MUMBAI 89 14.982624 15.606900 14.982624 2 304 12001009.0 0 0.624276 0.0 -4.057794 -124 0 5 54433.0 1.048923e+07 0.000000 1.048923e+07 1.202181e+05 1.202181e+05 0.83 2010-10-26 54433.0 2018-04-05 4.0 7.755937e+06 54433.0 54433.0 7.200653e+06 7.755937e+06 30.94 0.000000 0 3 10.924830 180 7.200653e+06 6.593778e+06 5.552847e+05 27732.787464 0.0 27732.787464 10901101.0 90 1 29.276316 (30.0, 40.0] (0.8, 1.0] 15.190716 54433.000000 1 0 0 0

Establishing relationship between different columns . Below are the columns which are derived from the other columns. This information will help us in performing feature engineering.

PRE_EMI_DUEAMT= PRE_EMI_RECEIVED_AMT+PRE_EMI_OS_AMOUNT

OUTSTANDING_PRINCIPAL+ PAID_PRINCIPAL = LOAN_AMT

BALANCE_TENURE + COMPLETED_TENURE = CURRENT_TENOR

ORIGNAL_TENOR - CURRENT_TENOR = DIFF_ORIGINAL_CURRENT_TENOR

EMI_OS_AMOUNT - BALANCE_EXCESS = NET_RECEIVABLE

PRE_EMI_DUEAMT - PRE_EMI_OS_AMOUNT = PRE_EMI_RECEIVED_AMT

INTEREST_START_DATE - AUTHORIZATIONDATE = DIFF_AUTH_INT_DATE

CURRENT_INTEREST_RATE_MAX - CURRENT_INTEREST_RATE_MIN = DIFF_CURRENT_INTEREST_RATE_MAX_MIN

MAX_EMI_AMOUNT - MIN_EMI_AMOUNT = DIFF_EMI_AMOUNT_MAX_MIN

ORIGNAL_INTEREST_RATE - CURRENT_INTEREST_RATE = DIFF_ORIGINAL_CURRENT_INTEREST_RATE

BALANCE_TENURE,COMPLETED_TENURE,CURRENT_TENOR,DIFF_ORIGINAL_CURRENT_TENOR,ORIGNAL_TENOR

Mean_Interest_Rate = CURRENT_INTEREST_RATE_MAX + CURRENT_INTEREST_RATE + CURRENT_INTEREST_RATE_MIN

In [117]:
# As Mean_Interest_Rate column is derived from CURRENT_INTEREST_RATE_MAX,CURRENT_INTEREST_RATE,CURRENT_INTEREST_RATE_MIN 
# column , we can drop these 3 columns with respect to feature engineering.
In [118]:
df_1.drop(['CURRENT_INTEREST_RATE','CURRENT_INTEREST_RATE_MAX','CURRENT_INTEREST_RATE_MIN'],axis=1,inplace=True)
In [119]:
df_1.head()
Out[119]:
AUTHORIZATIONDATE BALANCE_EXCESS BALANCE_TENURE CITY COMPLETED_TENURE CURRENT_INTEREST_RATE_CHANGES CURRENT_TENOR CUSTOMERID DIFF_AUTH_INT_DATE DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_EMI_AMOUNT_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DIFF_ORIGINAL_CURRENT_TENOR DPD DUEDAY EMI_AMOUNT EMI_DUEAMT EMI_OS_AMOUNT EMI_RECEIVED_AMT EXCESS_ADJUSTED_AMT EXCESS_AVAILABLE FOIR INTEREST_START_DATE LAST_RECEIPT_AMOUNT LAST_RECEIPT_DATE LATEST_TRANSACTION_MONTH LOAN_AMT MAX_EMI_AMOUNT MIN_EMI_AMOUNT MONTHOPENING NET_DISBURSED_AMT NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_INTEREST_RATE ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PAID_INTEREST PAID_PRINCIPAL PRE_EMI_DUEAMT PRE_EMI_OS_AMOUNT PRE_EMI_RECEIVED_AMT SCHEMEID MOB FORECLOSURE Percentage_Completion NET_LTV_RANGE FOIR_Range Mean_Interest_Rate MEAN_EMI_AMOUNT HL LAP STHL STLAP
0 2010-08-29 0.0 0 MUMBAI 45 1 45 12001000.0 1 0.312138 916441.0 -2.809242 75 0 1 83559.0 8.614898e+06 175477.782641 8.439420e+06 2.135869e+05 2.135869e+05 0.60 2010-08-30 1000000.0 2014-05-05 5.0 1.171011e+07 1000000.0 83559.0 8.693575e+06 1.171011e+07 40.06 175477.782641 1 1 10.612692 120 1.164472e+05 2.725723e+06 1.159366e+07 7994.273589 0.0 7994.273589 10901100.0 45 1 100.000000 (40.0, 50.0] (0.4, 0.6] 13.525980 389039.333333 1 0 0 0
1 2010-09-15 0.0 99 MUMBAI 38 0 137 12001001.0 0 0.000000 0.0 -3.121380 43 0 1 126530.0 1.061903e+07 279448.084560 1.033958e+07 0.000000e+00 0.000000e+00 0.60 2010-09-15 126530.0 2013-11-01 11.0 1.929025e+07 126530.0 126530.0 1.713861e+07 1.929025e+07 84.31 279448.084560 0 0 10.612692 180 1.234760e+07 4.945154e+06 6.942656e+06 100504.575864 0.0 100504.575864 10901100.0 38 1 27.737226 (80.0, 90.0] (0.4, 0.6] 13.734072 126530.000000 1 0 0 0
2 2010-11-02 0.0 231 MUMBAI 81 2 312 12001005.0 -1 0.624276 477122.0 -4.057794 -132 0 5 22878.0 4.670211e+06 0.000000 4.670211e+06 1.179716e+06 1.179716e+06 0.72 2010-11-01 22878.0 2017-08-05 8.0 3.933395e+06 500000.0 22878.0 3.744831e+06 3.933395e+06 50.89 0.000000 2 3 11.549106 180 2.994184e+06 2.990124e+06 9.392115e+05 5947.630536 0.0 5947.630536 10901101.0 81 1 25.961538 (50.0, 60.0] (0.6, 0.8] 15.814992 181918.666667 1 0 0 0
3 2010-10-06 0.0 0 THANE 91 4 91 12001007.0 0 2.497104 1434259.0 -0.624276 89 0 1 65741.0 1.313098e+07 63659.280762 1.306732e+07 6.770848e+06 6.770848e+06 0.60 2010-10-06 65741.0 2018-05-02 5.0 1.002259e+07 1500000.0 65741.0 5.747875e+06 1.002259e+07 84.63 63659.280762 3 8 10.612692 180 4.295120e+04 4.648994e+06 9.979637e+06 60322.180776 0.0 60322.180776 10901100.0 91 1 100.000000 (80.0, 90.0] (0.4, 0.6] 12.069336 543827.333333 1 0 0 0
4 2010-10-26 0.0 215 MUMBAI 89 2 304 12001009.0 0 0.624276 0.0 -4.057794 -124 0 5 54433.0 1.048923e+07 0.000000 1.048923e+07 1.202181e+05 1.202181e+05 0.83 2010-10-26 54433.0 2018-04-05 4.0 7.755937e+06 54433.0 54433.0 7.200653e+06 7.755937e+06 30.94 0.000000 0 3 10.924830 180 7.200653e+06 6.593778e+06 5.552847e+05 27732.787464 0.0 27732.787464 10901101.0 90 1 29.276316 (30.0, 40.0] (0.8, 1.0] 15.190716 54433.000000 1 0 0 0
In [120]:
df[['BALANCE_TENURE','COMPLETED_TENURE','CURRENT_TENOR','ORIGNAL_TENOR']].head()
# We can see that BALANCE_TENURE,COMPLETED_TENURE,CURRENT_TENOR are related with each other. Moreover Balance Tenur
# and Current Tenore are highly correlated with each other. Hence we will going to drop 1 column .Here I am dropping Current
# TENOR
Out[120]:
BALANCE_TENURE COMPLETED_TENURE CURRENT_TENOR ORIGNAL_TENOR
0 0 45 45 120
1 99 38 137 180
2 231 81 312 180
3 0 91 91 180
4 215 89 304 180
In [121]:
df_1.drop(['CURRENT_TENOR'],axis=1,inplace=True)
In [122]:
plt.figure(figsize=(20,20))
sns.heatmap(data=df_1[['BALANCE_EXCESS', 'BALANCE_TENURE','COMPLETED_TENURE','CURRENT_INTEREST_RATE_CHANGES','Mean_Interest_Rate',\
'DIFF_AUTH_INT_DATE','DIFF_CURRENT_INTEREST_RATE_MAX_MIN', 'DIFF_EMI_AMOUNT_MAX_MIN','DIFF_ORIGINAL_CURRENT_INTEREST_RATE',\
'DIFF_ORIGINAL_CURRENT_TENOR','DPD', 'DUEDAY', 'EMI_AMOUNT', 'EMI_DUEAMT', 'EMI_OS_AMOUNT','EMI_RECEIVED_AMT',\
'EXCESS_ADJUSTED_AMT', 'EXCESS_AVAILABLE', 'FOIR','LAST_RECEIPT_AMOUNT', \
'LATEST_TRANSACTION_MONTH', 'LOAN_AMT', 'MAX_EMI_AMOUNT','MIN_EMI_AMOUNT', 'MONTHOPENING', 'NET_DISBURSED_AMT', 'NET_LTV',
'NET_RECEIVABLE', 'NUM_EMI_CHANGES', 'NUM_LOW_FREQ_TRANSACTIONS','ORIGNAL_INTEREST_RATE', 'ORIGNAL_TENOR',\
'OUTSTANDING_PRINCIPAL','PAID_INTEREST', 'PAID_PRINCIPAL', 'PRE_EMI_DUEAMT','PRE_EMI_OS_AMOUNT', 'PRE_EMI_RECEIVED_AMT']].corr(),annot=True,fmt='0.1g',linewidths=2, linecolor='black',cbar_kws= {'orientation': 'horizontal'})
Out[122]:
<matplotlib.axes._subplots.AxesSubplot at 0x7d9161bba8>
In [123]:
df_1.drop(['ORIGNAL_INTEREST_RATE'],axis=1,inplace=True)
#As the ORIGNAL_INTEREST_RATE is nicely captured by  Mean_Interest_Rate , hence we will be dropping ORIGNAL_INTEREST_RATE.
In [124]:
df_1.drop(['DIFF_EMI_AMOUNT_MAX_MIN'],axis=1,inplace=True)
# DIFF_EMI_AMOUNT_MAX_MIN is highly correlated with MAX_EMI_AMOUNT.Hence we will be dropping DIFF_EMI_AMOUNT_MAX_MIN column.
In [125]:
plt.figure(figsize=(20,20))
sns.heatmap(data=df_1[['BALANCE_EXCESS', 'BALANCE_TENURE','COMPLETED_TENURE','CURRENT_INTEREST_RATE_CHANGES','Mean_Interest_Rate',\
'DIFF_AUTH_INT_DATE','DIFF_CURRENT_INTEREST_RATE_MAX_MIN','DIFF_ORIGINAL_CURRENT_INTEREST_RATE',\
'DIFF_ORIGINAL_CURRENT_TENOR','DPD', 'DUEDAY', 'EMI_AMOUNT', 'EMI_DUEAMT', 'EMI_OS_AMOUNT','EMI_RECEIVED_AMT',\
'EXCESS_ADJUSTED_AMT', 'EXCESS_AVAILABLE', 'FOIR','LAST_RECEIPT_AMOUNT', \
'LATEST_TRANSACTION_MONTH', 'LOAN_AMT', 'MAX_EMI_AMOUNT','MIN_EMI_AMOUNT', 'MONTHOPENING', 'NET_DISBURSED_AMT', 'NET_LTV',
'NET_RECEIVABLE', 'NUM_EMI_CHANGES', 'NUM_LOW_FREQ_TRANSACTIONS', 'ORIGNAL_TENOR',\
'OUTSTANDING_PRINCIPAL','PAID_INTEREST', 'PAID_PRINCIPAL', 'PRE_EMI_DUEAMT','PRE_EMI_OS_AMOUNT']].corr(),annot=True,fmt='0.1g',linewidths=2, linecolor='black',cbar_kws= {'orientation': 'horizontal'})
Out[125]:
<matplotlib.axes._subplots.AxesSubplot at 0x7d9be33dd8>
In [126]:
df_1[(df_1['PRE_EMI_DUEAMT'] - df_1['PRE_EMI_RECEIVED_AMT'])>0][['PRE_EMI_DUEAMT','PRE_EMI_RECEIVED_AMT']].shape
Out[126]:
(47, 2)
In [127]:
df_1.drop(['EMI_RECEIVED_AMT'],axis=1,inplace=True)
# PRE_EMI_DUEAMT is somewhat equal to PRE_EMI_RECEIVED_AMT .So both the variables are highly correlated. Moreover from
# business point of view if PRE_EMI_RECEIVED_AMT should be same as that of PRE_EMI_DUEAMT.If not person may start defaulting

# Hence we will going to drop EMI_RECEIVED_AMT column
In [128]:
df_1.drop(['PRE_EMI_RECEIVED_AMT'],axis=1,inplace=True)
# Similarly PRE_EMI_RECEIVED_AMT is highly correlated PRE_EMI_DUEAMT . Hence we will be dropping PRE_EMI_DUEAMT column
In [129]:
plt.figure(figsize=(20,20))
sns.heatmap(data=df_1[['BALANCE_EXCESS', 'BALANCE_TENURE','COMPLETED_TENURE','CURRENT_INTEREST_RATE_CHANGES','Mean_Interest_Rate',\
'DIFF_AUTH_INT_DATE','DIFF_CURRENT_INTEREST_RATE_MAX_MIN','DIFF_ORIGINAL_CURRENT_INTEREST_RATE',\
'DIFF_ORIGINAL_CURRENT_TENOR','DPD', 'DUEDAY', 'EMI_AMOUNT', 'EMI_DUEAMT', 'EMI_OS_AMOUNT',\
'EXCESS_ADJUSTED_AMT', 'EXCESS_AVAILABLE', 'FOIR','LAST_RECEIPT_AMOUNT', \
'LATEST_TRANSACTION_MONTH', 'LOAN_AMT', 'MAX_EMI_AMOUNT','MIN_EMI_AMOUNT', 'MONTHOPENING', 'NET_DISBURSED_AMT', 'NET_LTV',
'NET_RECEIVABLE', 'NUM_EMI_CHANGES', 'NUM_LOW_FREQ_TRANSACTIONS', 'ORIGNAL_TENOR',\
'OUTSTANDING_PRINCIPAL','PAID_INTEREST', 'PAID_PRINCIPAL', 'PRE_EMI_DUEAMT','PRE_EMI_OS_AMOUNT']].corr(),annot=True,fmt='0.1g',linewidths=2, linecolor='black',cbar_kws= {'orientation': 'horizontal'})
Out[129]:
<matplotlib.axes._subplots.AxesSubplot at 0x7d9ccd5d68>
In [130]:
df_1.head()
Out[130]:
AUTHORIZATIONDATE BALANCE_EXCESS BALANCE_TENURE CITY COMPLETED_TENURE CURRENT_INTEREST_RATE_CHANGES CUSTOMERID DIFF_AUTH_INT_DATE DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DIFF_ORIGINAL_CURRENT_TENOR DPD DUEDAY EMI_AMOUNT EMI_DUEAMT EMI_OS_AMOUNT EXCESS_ADJUSTED_AMT EXCESS_AVAILABLE FOIR INTEREST_START_DATE LAST_RECEIPT_AMOUNT LAST_RECEIPT_DATE LATEST_TRANSACTION_MONTH LOAN_AMT MAX_EMI_AMOUNT MIN_EMI_AMOUNT MONTHOPENING NET_DISBURSED_AMT NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PAID_INTEREST PAID_PRINCIPAL PRE_EMI_DUEAMT PRE_EMI_OS_AMOUNT SCHEMEID MOB FORECLOSURE Percentage_Completion NET_LTV_RANGE FOIR_Range Mean_Interest_Rate MEAN_EMI_AMOUNT HL LAP STHL STLAP
0 2010-08-29 0.0 0 MUMBAI 45 1 12001000.0 1 0.312138 -2.809242 75 0 1 83559.0 8.614898e+06 175477.782641 2.135869e+05 2.135869e+05 0.60 2010-08-30 1000000.0 2014-05-05 5.0 1.171011e+07 1000000.0 83559.0 8.693575e+06 1.171011e+07 40.06 175477.782641 1 1 120 1.164472e+05 2.725723e+06 1.159366e+07 7994.273589 0.0 10901100.0 45 1 100.000000 (40.0, 50.0] (0.4, 0.6] 13.525980 389039.333333 1 0 0 0
1 2010-09-15 0.0 99 MUMBAI 38 0 12001001.0 0 0.000000 -3.121380 43 0 1 126530.0 1.061903e+07 279448.084560 0.000000e+00 0.000000e+00 0.60 2010-09-15 126530.0 2013-11-01 11.0 1.929025e+07 126530.0 126530.0 1.713861e+07 1.929025e+07 84.31 279448.084560 0 0 180 1.234760e+07 4.945154e+06 6.942656e+06 100504.575864 0.0 10901100.0 38 1 27.737226 (80.0, 90.0] (0.4, 0.6] 13.734072 126530.000000 1 0 0 0
2 2010-11-02 0.0 231 MUMBAI 81 2 12001005.0 -1 0.624276 -4.057794 -132 0 5 22878.0 4.670211e+06 0.000000 1.179716e+06 1.179716e+06 0.72 2010-11-01 22878.0 2017-08-05 8.0 3.933395e+06 500000.0 22878.0 3.744831e+06 3.933395e+06 50.89 0.000000 2 3 180 2.994184e+06 2.990124e+06 9.392115e+05 5947.630536 0.0 10901101.0 81 1 25.961538 (50.0, 60.0] (0.6, 0.8] 15.814992 181918.666667 1 0 0 0
3 2010-10-06 0.0 0 THANE 91 4 12001007.0 0 2.497104 -0.624276 89 0 1 65741.0 1.313098e+07 63659.280762 6.770848e+06 6.770848e+06 0.60 2010-10-06 65741.0 2018-05-02 5.0 1.002259e+07 1500000.0 65741.0 5.747875e+06 1.002259e+07 84.63 63659.280762 3 8 180 4.295120e+04 4.648994e+06 9.979637e+06 60322.180776 0.0 10901100.0 91 1 100.000000 (80.0, 90.0] (0.4, 0.6] 12.069336 543827.333333 1 0 0 0
4 2010-10-26 0.0 215 MUMBAI 89 2 12001009.0 0 0.624276 -4.057794 -124 0 5 54433.0 1.048923e+07 0.000000 1.202181e+05 1.202181e+05 0.83 2010-10-26 54433.0 2018-04-05 4.0 7.755937e+06 54433.0 54433.0 7.200653e+06 7.755937e+06 30.94 0.000000 0 3 180 7.200653e+06 6.593778e+06 5.552847e+05 27732.787464 0.0 10901101.0 90 1 29.276316 (30.0, 40.0] (0.8, 1.0] 15.190716 54433.000000 1 0 0 0
In [131]:
sns.countplot(df_1['LAP'])
Out[131]:
<matplotlib.axes._subplots.AxesSubplot at 0x7d9ccd5be0>
In [132]:
sns.countplot(df_1['STHL'])
Out[132]:
<matplotlib.axes._subplots.AxesSubplot at 0x7d9dc05cf8>
In [133]:
df_1['PAID_AMOUNT']  =df_1['PAID_PRINCIPAL'] + df_1['PAID_INTEREST']
# We can add PAID_PRINCIPAL and PAID_INTEREST column to get new column ,PAID_AMOUNT. This can help us to drop first 2 column
# used.

Establishing relationship between different columns . Below are the columns which are derived from the other columns. This information will help us in performing feature engineering.

PRE_EMI_DUEAMT= PRE_EMI_RECEIVED_AMT ## +PRE_EMI_OS_AMOUNT

OUTSTANDING_PRINCIPAL+ PAID_PRINCIPAL ## = LOAN_AMT

BALANCE_TENURE + COMPLETED_TENURE = ## CURRENT_TENOR

ORIGNAL_TENOR - CURRENT_TENOR = DIFF_ORIGINAL_CURRENT_TENOR

EMI_OS_AMOUNT - BALANCE_EXCESS = NET_RECEIVABLE

PRE_EMI_DUEAMT - PRE_EMI_OS_AMOUNT = PRE_EMI_RECEIVED_AMT

INTEREST_START_DATE - AUTHORIZATIONDATE = DIFF_AUTH_INT_DATE

CURRENT_INTEREST_RATE_MAX - CURRENT_INTEREST_RATE_MIN = DIFF_CURRENT_INTEREST_RATE_MAX_MIN

MAX_EMI_AMOUNT - MIN_EMI_AMOUNT = ##DIFF_EMI_AMOUNT_MAX_MIN

ORIGNAL_INTEREST_RATE ## - CURRENT_INTEREST_RATE = DIFF_ORIGINAL_CURRENT_INTEREST_RATE

BALANCE_TENURE,COMPLETED_TENURE,CURRENT_TENOR,DIFF_ORIGINAL_CURRENT_TENOR,ORIGNAL_TENOR

Mean_Interest_Rate = #CURRENT_INTEREST_RATE_MAX + CURRENT_INTEREST_RATE + CURRENT_INTEREST_RATE_MIN

'CURRENT_INTEREST_RATE','CURRENT_INTEREST_RATE_MAX','CURRENT_INTEREST_RATE_MIN'

In [134]:
df_1.drop(['PAID_PRINCIPAL','PAID_INTEREST'],axis=1,inplace=True)
In [135]:
df[df['LOAN_AMT'] -df['NET_DISBURSED_AMT'] > 0][['LOAN_AMT','NET_DISBURSED_AMT']].shape
Out[135]:
(554, 2)
In [136]:
df_1.drop(['LOAN_AMT'],axis=1,inplace=True)  #Net disbursed amount is somewhat same as that of loan_amt . As we know the 
# all the interest calculation is done on the NET_DISBURSED_AMT and not on LOAN_AMOUNT . Hence for calculation and model
# development purpose , we will keep NET_DISBURSED_AMT and drop LOAN_AMT
In [137]:
plt.figure(figsize=(20,20))
sns.heatmap(data=df_1[['BALANCE_EXCESS', 'BALANCE_TENURE','COMPLETED_TENURE','CURRENT_INTEREST_RATE_CHANGES','Mean_Interest_Rate',\
'DIFF_AUTH_INT_DATE','DIFF_CURRENT_INTEREST_RATE_MAX_MIN','DIFF_ORIGINAL_CURRENT_INTEREST_RATE',\
'DIFF_ORIGINAL_CURRENT_TENOR','DPD', 'DUEDAY', 'EMI_AMOUNT', 'EMI_DUEAMT', 'EMI_OS_AMOUNT',\
'EXCESS_ADJUSTED_AMT', 'EXCESS_AVAILABLE', 'FOIR','LAST_RECEIPT_AMOUNT', \
'LATEST_TRANSACTION_MONTH', 'NET_DISBURSED_AMT', 'MAX_EMI_AMOUNT','MIN_EMI_AMOUNT', 'MONTHOPENING', 'NET_LTV',
'NET_RECEIVABLE', 'NUM_EMI_CHANGES', 'NUM_LOW_FREQ_TRANSACTIONS', 'ORIGNAL_TENOR',\
'OUTSTANDING_PRINCIPAL','PRE_EMI_DUEAMT','PRE_EMI_OS_AMOUNT','PAID_AMOUNT']].corr(),annot=True,fmt='0.1g',linewidths=2, linecolor='black',cbar_kws= {'orientation': 'horizontal'})

# Still there are multicollinearity present for certain variables.
Out[137]:
<matplotlib.axes._subplots.AxesSubplot at 0x7d9daf31d0>
In [138]:
df_2=df_1.copy()
# Taking a copy of df_1
In [139]:
df_2.drop(['MONTHOPENING'],axis=1,inplace=True)
# MONTHOPENING is not usefull for our analysis. Hence we will be dropping it
In [140]:
plt.figure(figsize=(20,20))
sns.heatmap(data=df_2[['BALANCE_EXCESS','BALANCE_TENURE','COMPLETED_TENURE','CURRENT_INTEREST_RATE_CHANGES','Mean_Interest_Rate',\
'DIFF_AUTH_INT_DATE','DIFF_CURRENT_INTEREST_RATE_MAX_MIN','DIFF_ORIGINAL_CURRENT_INTEREST_RATE',\
'DIFF_ORIGINAL_CURRENT_TENOR','DPD', 'DUEDAY', 'EMI_AMOUNT', 'EMI_DUEAMT', 'EMI_OS_AMOUNT',\
'EXCESS_ADJUSTED_AMT', 'EXCESS_AVAILABLE', 'FOIR','LAST_RECEIPT_AMOUNT', 'OUTSTANDING_PRINCIPAL',\
'LATEST_TRANSACTION_MONTH', 'NET_DISBURSED_AMT', 'MAX_EMI_AMOUNT','MIN_EMI_AMOUNT', 'NET_LTV',\
'NET_RECEIVABLE', 'NUM_EMI_CHANGES', 'NUM_LOW_FREQ_TRANSACTIONS', 'ORIGNAL_TENOR','PRE_EMI_DUEAMT','PRE_EMI_OS_AMOUNT','PAID_AMOUNT']].corr(),annot=True,fmt='0.1g',linewidths=2, linecolor='black',cbar_kws= {'orientation': 'horizontal'})

# There are still certain variables which we will have to drop as multicollinearity is still present
Out[140]:
<matplotlib.axes._subplots.AxesSubplot at 0x7d9ee6d5f8>
In [141]:
# PRE_EMI_OS_AMOUNT 6.928615e+14
# EMI_RECEIVED_AMT 2.144571e+14
In [142]:
df_2.shape
Out[142]:
(19490, 47)

Checking of multicollinearity with VIF

In [143]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
In [144]:
df_2=add_constant(df_2)
In [145]:
df_2.head(5)
Out[145]:
const AUTHORIZATIONDATE BALANCE_EXCESS BALANCE_TENURE CITY COMPLETED_TENURE CURRENT_INTEREST_RATE_CHANGES CUSTOMERID DIFF_AUTH_INT_DATE DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DIFF_ORIGINAL_CURRENT_TENOR DPD DUEDAY EMI_AMOUNT EMI_DUEAMT EMI_OS_AMOUNT EXCESS_ADJUSTED_AMT EXCESS_AVAILABLE FOIR INTEREST_START_DATE LAST_RECEIPT_AMOUNT LAST_RECEIPT_DATE LATEST_TRANSACTION_MONTH MAX_EMI_AMOUNT MIN_EMI_AMOUNT NET_DISBURSED_AMT NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PRE_EMI_DUEAMT PRE_EMI_OS_AMOUNT SCHEMEID MOB FORECLOSURE Percentage_Completion NET_LTV_RANGE FOIR_Range Mean_Interest_Rate MEAN_EMI_AMOUNT HL LAP STHL STLAP PAID_AMOUNT
0 1.0 2010-08-29 0.0 0 MUMBAI 45 1 12001000.0 1 0.312138 -2.809242 75 0 1 83559.0 8.614898e+06 175477.782641 2.135869e+05 2.135869e+05 0.60 2010-08-30 1000000.0 2014-05-05 5.0 1000000.0 83559.0 1.171011e+07 40.06 175477.782641 1 1 120 1.164472e+05 7994.273589 0.0 10901100.0 45 1 100.000000 (40.0, 50.0] (0.4, 0.6] 13.525980 389039.333333 1 0 0 0 1.431938e+07
1 1.0 2010-09-15 0.0 99 MUMBAI 38 0 12001001.0 0 0.000000 -3.121380 43 0 1 126530.0 1.061903e+07 279448.084560 0.000000e+00 0.000000e+00 0.60 2010-09-15 126530.0 2013-11-01 11.0 126530.0 126530.0 1.929025e+07 84.31 279448.084560 0 0 180 1.234760e+07 100504.575864 0.0 10901100.0 38 1 27.737226 (80.0, 90.0] (0.4, 0.6] 13.734072 126530.000000 1 0 0 0 1.188781e+07
2 1.0 2010-11-02 0.0 231 MUMBAI 81 2 12001005.0 -1 0.624276 -4.057794 -132 0 5 22878.0 4.670211e+06 0.000000 1.179716e+06 1.179716e+06 0.72 2010-11-01 22878.0 2017-08-05 8.0 500000.0 22878.0 3.933395e+06 50.89 0.000000 2 3 180 2.994184e+06 5947.630536 0.0 10901101.0 81 1 25.961538 (50.0, 60.0] (0.6, 0.8] 15.814992 181918.666667 1 0 0 0 3.929335e+06
3 1.0 2010-10-06 0.0 0 THANE 91 4 12001007.0 0 2.497104 -0.624276 89 0 1 65741.0 1.313098e+07 63659.280762 6.770848e+06 6.770848e+06 0.60 2010-10-06 65741.0 2018-05-02 5.0 1500000.0 65741.0 1.002259e+07 84.63 63659.280762 3 8 180 4.295120e+04 60322.180776 0.0 10901100.0 91 1 100.000000 (80.0, 90.0] (0.4, 0.6] 12.069336 543827.333333 1 0 0 0 1.462863e+07
4 1.0 2010-10-26 0.0 215 MUMBAI 89 2 12001009.0 0 0.624276 -4.057794 -124 0 5 54433.0 1.048923e+07 0.000000 1.202181e+05 1.202181e+05 0.83 2010-10-26 54433.0 2018-04-05 4.0 54433.0 54433.0 7.755937e+06 30.94 0.000000 0 3 180 7.200653e+06 27732.787464 0.0 10901101.0 90 1 29.276316 (30.0, 40.0] (0.8, 1.0] 15.190716 54433.000000 1 0 0 0 7.149063e+06
In [146]:
df_2[df_2['LAST_RECEIPT_DATE'] < df_2['AUTHORIZATIONDATE']].head()[['AUTHORIZATIONDATE','INTEREST_START_DATE','LAST_RECEIPT_DATE']]
Out[146]:
AUTHORIZATIONDATE INTEREST_START_DATE LAST_RECEIPT_DATE
1639 2013-11-20 2013-11-20 2013-10-10
1756 2013-12-31 2013-12-31 2013-11-26
1786 2013-12-31 2013-12-31 2013-12-12
1832 2014-01-31 2014-01-31 2013-10-25
2115 2014-03-31 2014-03-31 2014-03-22
In [147]:
# As per business understanding , LAST_RECEIPT_DATE should be greater than AUTHORIZATIONDATE & INTEREST_START_DATE.
# and INTEREST_START_DATE should be greater than or equal to AUTHORIZATIONDATE. 

#Whenever INTEREST_START_DATE is less than AUTHORIZATIONDATE we will going to replace INTEREST_START_DATE with
# AUTHORIZATIONDATE

#Whenever LAST_RECEIPT_DATE is less than INTEREST_START_DATE we will going to replace LAST_RECEIPT_DATE with
# INTEREST_START_DATE
In [148]:
df_2['LAST_RECEIPT_DATE'] = np.where(df_2['LAST_RECEIPT_DATE'] < df_2['INTEREST_START_DATE'],\
                                     df_2['INTEREST_START_DATE'],df_2['LAST_RECEIPT_DATE'])
df_2['INTEREST_START_DATE'] = np.where(df_2['INTEREST_START_DATE'] < df_2['AUTHORIZATIONDATE'],\
                                       df_2['AUTHORIZATIONDATE'],df_2['INTEREST_START_DATE'])
In [149]:
df_2[df_2['LAST_RECEIPT_DATE'] < df_2['INTEREST_START_DATE']].head()
Out[149]:
const AUTHORIZATIONDATE BALANCE_EXCESS BALANCE_TENURE CITY COMPLETED_TENURE CURRENT_INTEREST_RATE_CHANGES CUSTOMERID DIFF_AUTH_INT_DATE DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DIFF_ORIGINAL_CURRENT_TENOR DPD DUEDAY EMI_AMOUNT EMI_DUEAMT EMI_OS_AMOUNT EXCESS_ADJUSTED_AMT EXCESS_AVAILABLE FOIR INTEREST_START_DATE LAST_RECEIPT_AMOUNT LAST_RECEIPT_DATE LATEST_TRANSACTION_MONTH MAX_EMI_AMOUNT MIN_EMI_AMOUNT NET_DISBURSED_AMT NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PRE_EMI_DUEAMT PRE_EMI_OS_AMOUNT SCHEMEID MOB FORECLOSURE Percentage_Completion NET_LTV_RANGE FOIR_Range Mean_Interest_Rate MEAN_EMI_AMOUNT HL LAP STHL STLAP PAID_AMOUNT
19071 1.0 2018-09-30 0.0 240 JALNA 0 0 12032791.0 -1 0.0 0.0 0 0 5 12365.0 0.0 0.0 0.0 0.0 0.36 2018-09-30 12365.0 2018-09-29 9.0 12365.0 12365.0 1.560779e+06 86.6 0.0 0 0 240 1.560779e+06 0.0 0.0 10901298.0 0 0 0.0 (80.0, 90.0] (0.2, 0.4] 13.109796 12365.0 0 0 1 0 0.0
In [150]:
df_2['DIFF_LAST_RECEIPT_INT_DATE'] = df_2['LAST_RECEIPT_DATE'] - df_2['INTEREST_START_DATE']
# Addition of new column DIFF_LAST_RECEIPT_INT_DATE which takes difference between LAST_RECEIPT_DATE & INTEREST_START_DATE
In [151]:
df_2['DIFF_LAST_RECEIPT_INT_DATE']=df_2['DIFF_LAST_RECEIPT_INT_DATE'].dt.days
In [152]:
df_2.drop(['AUTHORIZATIONDATE','DIFF_AUTH_INT_DATE','CITY','CUSTOMERID','INTEREST_START_DATE',\
           'LAST_RECEIPT_DATE','SCHEMEID','MOB'],axis=1,inplace=True)

# INTEREST_START_DATE,LAST_RECEIPT_DATE has been taken care inform of DIFF_LAST_RECEIPT_INT_DATE column 
# AUTHORIZATIONDATE and INTEREST_START_DATE is somewhat same as each other. They will not be used for model development. 
# Hence we will going to drop them.
# DIFF_AUTH_INT_DATE will not be useful for understanding. Hence we will going to drop them.
# We will going to drop CITY , SCHEMEID , MOB column as well for model development purpose as they doesn't seems to be 
# important for model building.
In [153]:
df_2.drop(['EMI_AMOUNT','MAX_EMI_AMOUNT','MIN_EMI_AMOUNT'],axis=1,inplace=True)

#All these 3 columns have been combined to form a new column Mean_emi_amount . 
#(df['MEAN_EMI_AMOUNT'] =(df['EMI_AMOUNT'] + df['MAX_EMI_AMOUNT'] + df['MIN_EMI_AMOUNT'])/3 )
#Hence we will be dropping these 3 columns
In [154]:
df_2.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 19490 entries, 0 to 20011
Data columns (total 38 columns):
const                                  19490 non-null float64
BALANCE_EXCESS                         19490 non-null float64
BALANCE_TENURE                         19490 non-null int64
COMPLETED_TENURE                       19490 non-null int64
CURRENT_INTEREST_RATE_CHANGES          19490 non-null int64
DIFF_CURRENT_INTEREST_RATE_MAX_MIN     19490 non-null float64
DIFF_ORIGINAL_CURRENT_INTEREST_RATE    19490 non-null float64
DIFF_ORIGINAL_CURRENT_TENOR            19490 non-null int64
DPD                                    19490 non-null int64
DUEDAY                                 19490 non-null int64
EMI_DUEAMT                             19490 non-null float64
EMI_OS_AMOUNT                          19490 non-null float64
EXCESS_ADJUSTED_AMT                    19490 non-null float64
EXCESS_AVAILABLE                       19490 non-null float64
FOIR                                   19490 non-null float64
LAST_RECEIPT_AMOUNT                    19490 non-null float64
LATEST_TRANSACTION_MONTH               19490 non-null float64
NET_DISBURSED_AMT                      19490 non-null float64
NET_LTV                                19490 non-null float64
NET_RECEIVABLE                         19490 non-null float64
NUM_EMI_CHANGES                        19490 non-null int64
NUM_LOW_FREQ_TRANSACTIONS              19490 non-null int64
ORIGNAL_TENOR                          19490 non-null int64
OUTSTANDING_PRINCIPAL                  19490 non-null float64
PRE_EMI_DUEAMT                         19490 non-null float64
PRE_EMI_OS_AMOUNT                      19490 non-null float64
FORECLOSURE                            19490 non-null int64
Percentage_Completion                  19490 non-null float64
NET_LTV_RANGE                          19490 non-null category
FOIR_Range                             19490 non-null category
Mean_Interest_Rate                     19490 non-null float64
MEAN_EMI_AMOUNT                        19490 non-null float64
HL                                     19490 non-null uint8
LAP                                    19490 non-null uint8
STHL                                   19490 non-null uint8
STLAP                                  19490 non-null uint8
PAID_AMOUNT                            19490 non-null float64
DIFF_LAST_RECEIPT_INT_DATE             19490 non-null int64
dtypes: category(2), float64(21), int64(11), uint8(4)
memory usage: 5.0 MB

For VIF calculation purpose , we are dropping NET_LTV_RANGE and FOIR_Range . If necessary we will going to import

these two columns again for analysis.

In [155]:
df_2.drop(['NET_LTV_RANGE','FOIR_Range'],axis=1,inplace=True)

VIF CALCULATION

In [156]:
pd.Series([variance_inflation_factor(df_2.values, i) 
               for i in range(df_2.shape[1])], 
              index=df_2.columns)

# The VIF values are certain columns are very high . We will try to intelligenty feature engineer or drop the columns 
# to pull back the VIF within limits
Out[156]:
const                                  0.000000e+00
BALANCE_EXCESS                         1.948767e+12
BALANCE_TENURE                         1.040092e+13
COMPLETED_TENURE                       3.753000e+14
CURRENT_INTEREST_RATE_CHANGES          4.973058e+00
DIFF_CURRENT_INTEREST_RATE_MAX_MIN     3.775639e+00
DIFF_ORIGINAL_CURRENT_INTEREST_RATE    1.966990e+00
DIFF_ORIGINAL_CURRENT_TENOR            8.188363e+14
DPD                                    1.662080e+00
DUEDAY                                 1.289811e+00
EMI_DUEAMT                             7.350256e+00
EMI_OS_AMOUNT                          1.801440e+15
EXCESS_ADJUSTED_AMT                             inf
EXCESS_AVAILABLE                                inf
FOIR                                   1.066606e+00
LAST_RECEIPT_AMOUNT                    4.068742e+00
LATEST_TRANSACTION_MONTH               1.772947e+00
NET_DISBURSED_AMT                      2.311080e+02
NET_LTV                                1.296805e+00
NET_RECEIVABLE                         1.372839e+12
NUM_EMI_CHANGES                        2.810807e+00
NUM_LOW_FREQ_TRANSACTIONS              2.940719e+00
ORIGNAL_TENOR                          1.217189e+14
OUTSTANDING_PRINCIPAL                  1.767378e+02
PRE_EMI_DUEAMT                         2.467844e+00
PRE_EMI_OS_AMOUNT                               inf
FORECLOSURE                            1.632166e+00
Percentage_Completion                  5.469412e+00
Mean_Interest_Rate                     2.705666e+00
MEAN_EMI_AMOUNT                        6.250638e+00
HL                                     9.007199e+15
LAP                                    1.801440e+14
STHL                                            inf
STLAP                                  9.007199e+15
PAID_AMOUNT                            3.128175e+00
DIFF_LAST_RECEIPT_INT_DATE             8.936540e+01
dtype: float64
In [157]:
df_2.drop(['PRE_EMI_OS_AMOUNT'],axis=1,inplace=True)
# As VIF values is highest for this column hence we will going to drop it. and by using other columns HL,LAP,STLAP 
# we can easily determine the status of STHL column. Hence we will be dropping STHL column
In [158]:
pd.Series([variance_inflation_factor(df_2.values, i) 
               for i in range(df_2.shape[1])], 
              index=df_2.columns)
Out[158]:
const                                  0.000000e+00
BALANCE_EXCESS                                  inf
BALANCE_TENURE                         4.526231e+13
COMPLETED_TENURE                       1.452774e+14
CURRENT_INTEREST_RATE_CHANGES          4.973058e+00
DIFF_CURRENT_INTEREST_RATE_MAX_MIN     3.775639e+00
DIFF_ORIGINAL_CURRENT_INTEREST_RATE    1.966990e+00
DIFF_ORIGINAL_CURRENT_TENOR            4.503600e+15
DPD                                    1.662080e+00
DUEDAY                                 1.289811e+00
EMI_DUEAMT                             7.350256e+00
EMI_OS_AMOUNT                          3.471498e+03
EXCESS_ADJUSTED_AMT                             inf
EXCESS_AVAILABLE                                inf
FOIR                                   1.066606e+00
LAST_RECEIPT_AMOUNT                    4.068742e+00
LATEST_TRANSACTION_MONTH               1.772947e+00
NET_DISBURSED_AMT                      2.311080e+02
NET_LTV                                1.296805e+00
NET_RECEIVABLE                         1.543443e+04
NUM_EMI_CHANGES                        2.810807e+00
NUM_LOW_FREQ_TRANSACTIONS              2.940719e+00
ORIGNAL_TENOR                          1.452774e+14
OUTSTANDING_PRINCIPAL                  1.767378e+02
PRE_EMI_DUEAMT                         2.467844e+00
FORECLOSURE                            1.632166e+00
Percentage_Completion                  5.469412e+00
Mean_Interest_Rate                     2.705666e+00
MEAN_EMI_AMOUNT                        6.250638e+00
HL                                     2.251800e+15
LAP                                    1.251000e+14
STHL                                   9.007199e+15
STLAP                                           inf
PAID_AMOUNT                            3.128175e+00
DIFF_LAST_RECEIPT_INT_DATE             8.936540e+01
dtype: float64
In [159]:
df_2.drop(['HL'],axis=1,inplace=True) 

# PRE_EMI_OS_AMOUNT 6.928615e+14
# EMI_RECEIVED_AMT 2.144571e+14
# Looking at the heatmap , we can observe that EMI_RECEIVED_AMT is highly correlated with other variables . But this 
# correlation is not seen for PRE_EMI_OS_AMOUNT . Hence we will be dropping EMI_RECEIVED_AMT rather than PRE_EMI_OS_AMOUNT
In [160]:
pd.Series([variance_inflation_factor(df_2.values, i) 
               for i in range(df_2.shape[1])], 
              index=df_2.columns)
Out[160]:
const                                  1.600901e+02
BALANCE_EXCESS                                  inf
BALANCE_TENURE                         5.629500e+14
COMPLETED_TENURE                       1.958087e+14
CURRENT_INTEREST_RATE_CHANGES          4.973058e+00
DIFF_CURRENT_INTEREST_RATE_MAX_MIN     3.775639e+00
DIFF_ORIGINAL_CURRENT_INTEREST_RATE    1.966990e+00
DIFF_ORIGINAL_CURRENT_TENOR            1.801440e+15
DPD                                    1.662080e+00
DUEDAY                                 1.289811e+00
EMI_DUEAMT                             7.350256e+00
EMI_OS_AMOUNT                          3.471498e+03
EXCESS_ADJUSTED_AMT                             inf
EXCESS_AVAILABLE                                inf
FOIR                                   1.066606e+00
LAST_RECEIPT_AMOUNT                    4.068741e+00
LATEST_TRANSACTION_MONTH               1.772947e+00
NET_DISBURSED_AMT                      2.311080e+02
NET_LTV                                1.296805e+00
NET_RECEIVABLE                         1.543443e+04
NUM_EMI_CHANGES                        2.810807e+00
NUM_LOW_FREQ_TRANSACTIONS              2.940719e+00
ORIGNAL_TENOR                          4.549091e+13
OUTSTANDING_PRINCIPAL                  1.767378e+02
PRE_EMI_DUEAMT                         2.467844e+00
FORECLOSURE                            1.632166e+00
Percentage_Completion                  5.469412e+00
Mean_Interest_Rate                     2.705666e+00
MEAN_EMI_AMOUNT                        6.250638e+00
LAP                                    3.416333e+00
STHL                                   4.351524e+00
STLAP                                  4.768820e+00
PAID_AMOUNT                            3.128175e+00
DIFF_LAST_RECEIPT_INT_DATE             8.936540e+01
dtype: float64

Establishing relationship between different columns . Below are the columns which are derived from the other columns. This information

will help us in performing feature engineering.

PRE_EMI_DUEAMT= PRE_EMI_RECEIVED_AMT ## +PRE_EMI_OS_AMOUNT

OUTSTANDING_PRINCIPAL+ PAID_PRINCIPAL ## = LOAN_AMT

BALANCE_TENURE + COMPLETED_TENURE = ## CURRENT_TENOR

ORIGNAL_TENOR - CURRENT_TENOR = DIFF_ORIGINAL_CURRENT_TENOR

EMI_OS_AMOUNT - BALANCE_EXCESS = NET_RECEIVABLE

PRE_EMI_DUEAMT - PRE_EMI_OS_AMOUNT = PRE_EMI_RECEIVED_AMT

INTEREST_START_DATE - AUTHORIZATIONDATE = DIFF_AUTH_INT_DATE

CURRENT_INTEREST_RATE_MAX - ##CURRENT_INTEREST_RATE_MIN =

DIFF_CURRENT_INTEREST_RATE_MAX_MIN

MAX_EMI_AMOUNT - MIN_EMI_AMOUNT = ##DIFF_EMI_AMOUNT_MAX_MIN

ORIGNAL_INTEREST_RATE ## - CURRENT_INTEREST_RATE = DIFF_ORIGINAL_CURRENT_INTEREST_RATE

BALANCE_TENURE,COMPLETED_TENURE,CURRENT_TENOR,DIFF_ORIGINAL_CURRENT_TENOR,ORIGNAL_TENOR

Mean_Interest_Rate = #CURRENT_INTEREST_RATE_MAX + CURRENT_INTEREST_RATE + CURRENT_INTEREST_RATE_MIN

'CURRENT_INTEREST_RATE','CURRENT_INTEREST_RATE_MAX','CURRENT_INTEREST_RATE_MIN'

In [162]:
pd.Series([variance_inflation_factor(df_2.values, i) 
               for i in range(df_2.shape[1])], 
              index=df_2.columns)
Out[162]:
const                                  1.600901e+02
BALANCE_EXCESS                                  inf
BALANCE_TENURE                         5.629500e+14
COMPLETED_TENURE                       1.958087e+14
CURRENT_INTEREST_RATE_CHANGES          4.973058e+00
DIFF_CURRENT_INTEREST_RATE_MAX_MIN     3.775639e+00
DIFF_ORIGINAL_CURRENT_INTEREST_RATE    1.966990e+00
DIFF_ORIGINAL_CURRENT_TENOR            1.801440e+15
DPD                                    1.662080e+00
DUEDAY                                 1.289811e+00
EMI_DUEAMT                             7.350256e+00
EMI_OS_AMOUNT                          3.471498e+03
EXCESS_ADJUSTED_AMT                             inf
EXCESS_AVAILABLE                                inf
FOIR                                   1.066606e+00
LAST_RECEIPT_AMOUNT                    4.068741e+00
LATEST_TRANSACTION_MONTH               1.772947e+00
NET_DISBURSED_AMT                      2.311080e+02
NET_LTV                                1.296805e+00
NET_RECEIVABLE                         1.543443e+04
NUM_EMI_CHANGES                        2.810807e+00
NUM_LOW_FREQ_TRANSACTIONS              2.940719e+00
ORIGNAL_TENOR                          4.549091e+13
OUTSTANDING_PRINCIPAL                  1.767378e+02
PRE_EMI_DUEAMT                         2.467844e+00
FORECLOSURE                            1.632166e+00
Percentage_Completion                  5.469412e+00
Mean_Interest_Rate                     2.705666e+00
MEAN_EMI_AMOUNT                        6.250638e+00
LAP                                    3.416333e+00
STHL                                   4.351524e+00
STLAP                                  4.768820e+00
PAID_AMOUNT                            3.128175e+00
DIFF_LAST_RECEIPT_INT_DATE             8.936540e+01
dtype: float64
In [163]:
df_2.drop(['EXCESS_AVAILABLE'],axis=1,inplace=True)
# We will be dropping EXCESS_AVAILABLE as it is has high correlation with other variable and has infinity VIF values.
In [164]:
pd.Series([variance_inflation_factor(df_2.values, i) 
               for i in range(df_2.shape[1])], 
              index=df_2.columns)
Out[164]:
const                                  1.600643e+02
BALANCE_EXCESS                         1.570794e+04
BALANCE_TENURE                                  inf
COMPLETED_TENURE                                inf
CURRENT_INTEREST_RATE_CHANGES          4.973026e+00
DIFF_CURRENT_INTEREST_RATE_MAX_MIN     3.775580e+00
DIFF_ORIGINAL_CURRENT_INTEREST_RATE    1.966990e+00
DIFF_ORIGINAL_CURRENT_TENOR                     inf
DPD                                    1.662067e+00
DUEDAY                                 1.289785e+00
EMI_DUEAMT                             7.350207e+00
EMI_OS_AMOUNT                          3.471497e+03
EXCESS_ADJUSTED_AMT                    1.035893e+01
FOIR                                   1.066606e+00
LAST_RECEIPT_AMOUNT                    4.062408e+00
LATEST_TRANSACTION_MONTH               1.772853e+00
NET_DISBURSED_AMT                      2.311068e+02
NET_LTV                                1.296804e+00
NET_RECEIVABLE                         1.543443e+04
NUM_EMI_CHANGES                        2.810777e+00
NUM_LOW_FREQ_TRANSACTIONS              2.940661e+00
ORIGNAL_TENOR                                   inf
OUTSTANDING_PRINCIPAL                  1.767373e+02
PRE_EMI_DUEAMT                         2.467841e+00
FORECLOSURE                            1.632090e+00
Percentage_Completion                  5.469408e+00
Mean_Interest_Rate                     2.705626e+00
MEAN_EMI_AMOUNT                        6.248546e+00
LAP                                    3.416333e+00
STHL                                   4.351454e+00
STLAP                                  4.768731e+00
PAID_AMOUNT                            3.127565e+00
DIFF_LAST_RECEIPT_INT_DATE             8.936525e+01
dtype: float64
In [165]:
df_2.drop(['DIFF_ORIGINAL_CURRENT_TENOR'],axis=1,inplace=True)
In [166]:
pd.Series([variance_inflation_factor(df_2.values, i) 
               for i in range(df_2.shape[1])], 
              index=df_2.columns)
Out[166]:
const                                    160.064273
BALANCE_EXCESS                         15707.937499
BALANCE_TENURE                             7.051333
COMPLETED_TENURE                          89.589886
CURRENT_INTEREST_RATE_CHANGES              4.973026
DIFF_CURRENT_INTEREST_RATE_MAX_MIN         3.775580
DIFF_ORIGINAL_CURRENT_INTEREST_RATE        1.966990
DPD                                        1.662067
DUEDAY                                     1.289785
EMI_DUEAMT                                 7.350207
EMI_OS_AMOUNT                           3471.497111
EXCESS_ADJUSTED_AMT                       10.358931
FOIR                                       1.066606
LAST_RECEIPT_AMOUNT                        4.062408
LATEST_TRANSACTION_MONTH                   1.772853
NET_DISBURSED_AMT                        231.106823
NET_LTV                                    1.296804
NET_RECEIVABLE                         15434.427592
NUM_EMI_CHANGES                            2.810777
NUM_LOW_FREQ_TRANSACTIONS                  2.940661
ORIGNAL_TENOR                              5.062865
OUTSTANDING_PRINCIPAL                    176.737270
PRE_EMI_DUEAMT                             2.467841
FORECLOSURE                                1.632090
Percentage_Completion                      5.469408
Mean_Interest_Rate                         2.705626
MEAN_EMI_AMOUNT                            6.248546
LAP                                        3.416333
STHL                                       4.351454
STLAP                                      4.768731
PAID_AMOUNT                                3.127565
DIFF_LAST_RECEIPT_INT_DATE                89.365247
dtype: float64
In [167]:
df_2.drop(['BALANCE_EXCESS'],axis=1,inplace=True)
In [168]:
pd.Series([variance_inflation_factor(df_2.values, i) 
               for i in range(df_2.shape[1])], 
              index=df_2.columns)
Out[168]:
const                                  160.057119
BALANCE_TENURE                           7.051238
COMPLETED_TENURE                        88.766428
CURRENT_INTEREST_RATE_CHANGES            4.972178
DIFF_CURRENT_INTEREST_RATE_MAX_MIN       3.775578
DIFF_ORIGINAL_CURRENT_INTEREST_RATE      1.966940
DPD                                      1.641396
DUEDAY                                   1.289670
EMI_DUEAMT                               7.347377
EMI_OS_AMOUNT                            1.673876
EXCESS_ADJUSTED_AMT                     10.358049
FOIR                                     1.066601
LAST_RECEIPT_AMOUNT                      4.062376
LATEST_TRANSACTION_MONTH                 1.772628
NET_DISBURSED_AMT                      231.074509
NET_LTV                                  1.296660
NET_RECEIVABLE                           1.961447
NUM_EMI_CHANGES                          2.799836
NUM_LOW_FREQ_TRANSACTIONS                2.935106
ORIGNAL_TENOR                            5.062462
OUTSTANDING_PRINCIPAL                  176.716790
PRE_EMI_DUEAMT                           2.459391
FORECLOSURE                              1.632075
Percentage_Completion                    5.469397
Mean_Interest_Rate                       2.705620
MEAN_EMI_AMOUNT                          6.248436
LAP                                      3.415580
STHL                                     4.351214
STLAP                                    4.768113
PAID_AMOUNT                              3.123075
DIFF_LAST_RECEIPT_INT_DATE              88.612197
dtype: float64
In [169]:
df_2.drop(['NET_DISBURSED_AMT'],axis=1,inplace=True)
In [170]:
pd.Series([variance_inflation_factor(df_2.values, i) 
               for i in range(df_2.shape[1])], 
              index=df_2.columns)
Out[170]:
const                                  159.291807
BALANCE_TENURE                           7.026818
COMPLETED_TENURE                        88.463086
CURRENT_INTEREST_RATE_CHANGES            4.971445
DIFF_CURRENT_INTEREST_RATE_MAX_MIN       3.775554
DIFF_ORIGINAL_CURRENT_INTEREST_RATE      1.966848
DPD                                      1.638488
DUEDAY                                   1.287672
EMI_DUEAMT                               1.924985
EMI_OS_AMOUNT                            1.645146
EXCESS_ADJUSTED_AMT                      3.240905
FOIR                                     1.066039
LAST_RECEIPT_AMOUNT                      3.976162
LATEST_TRANSACTION_MONTH                 1.772547
NET_LTV                                  1.296276
NET_RECEIVABLE                           1.937128
NUM_EMI_CHANGES                          2.797211
NUM_LOW_FREQ_TRANSACTIONS                2.924469
ORIGNAL_TENOR                            5.031526
OUTSTANDING_PRINCIPAL                    1.946320
PRE_EMI_DUEAMT                           2.459307
FORECLOSURE                              1.631990
Percentage_Completion                    4.959791
Mean_Interest_Rate                       2.700545
MEAN_EMI_AMOUNT                          6.226511
LAP                                      3.412323
STHL                                     4.350965
STLAP                                    4.767093
PAID_AMOUNT                              2.871289
DIFF_LAST_RECEIPT_INT_DATE              88.585834
dtype: float64
In [171]:
df_2.drop(['DIFF_LAST_RECEIPT_INT_DATE'],axis=1,inplace=True)
In [172]:
pd.Series([variance_inflation_factor(df_2.values, i) 
               for i in range(df_2.shape[1])], 
              index=df_2.columns)
Out[172]:
const                                  159.288794
BALANCE_TENURE                           7.016944
COMPLETED_TENURE                         4.515033
CURRENT_INTEREST_RATE_CHANGES            4.966703
DIFF_CURRENT_INTEREST_RATE_MAX_MIN       3.775552
DIFF_ORIGINAL_CURRENT_INTEREST_RATE      1.966795
DPD                                      1.635699
DUEDAY                                   1.286530
EMI_DUEAMT                               1.923758
EMI_OS_AMOUNT                            1.644338
EXCESS_ADJUSTED_AMT                      3.238440
FOIR                                     1.066031
LAST_RECEIPT_AMOUNT                      3.976156
LATEST_TRANSACTION_MONTH                 1.772105
NET_LTV                                  1.290114
NET_RECEIVABLE                           1.936328
NUM_EMI_CHANGES                          2.330704
NUM_LOW_FREQ_TRANSACTIONS                2.835505
ORIGNAL_TENOR                            5.026520
OUTSTANDING_PRINCIPAL                    1.917337
PRE_EMI_DUEAMT                           2.055933
FORECLOSURE                              1.631839
Percentage_Completion                    4.959771
Mean_Interest_Rate                       2.688591
MEAN_EMI_AMOUNT                          6.224938
LAP                                      3.399959
STHL                                     4.349953
STLAP                                    4.733040
PAID_AMOUNT                              2.783096
dtype: float64
In [173]:
plt.figure(figsize=(20,20))
sns.heatmap(df_2.corr(),annot=True,fmt='0.1g',linewidths=2, linecolor='black',cbar_kws= {'orientation': 'horizontal'})
Out[173]:
<matplotlib.axes._subplots.AxesSubplot at 0x7d9f881668>
In [174]:
df_2.head()
Out[174]:
const BALANCE_TENURE COMPLETED_TENURE CURRENT_INTEREST_RATE_CHANGES DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DPD DUEDAY EMI_DUEAMT EMI_OS_AMOUNT EXCESS_ADJUSTED_AMT FOIR LAST_RECEIPT_AMOUNT LATEST_TRANSACTION_MONTH NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PRE_EMI_DUEAMT FORECLOSURE Percentage_Completion Mean_Interest_Rate MEAN_EMI_AMOUNT LAP STHL STLAP PAID_AMOUNT
0 1.0 0 45 1 0.312138 -2.809242 0 1 8.614898e+06 175477.782641 2.135869e+05 0.60 1000000.0 5.0 40.06 175477.782641 1 1 120 1.164472e+05 7994.273589 1 100.000000 13.525980 389039.333333 0 0 0 1.431938e+07
1 1.0 99 38 0 0.000000 -3.121380 0 1 1.061903e+07 279448.084560 0.000000e+00 0.60 126530.0 11.0 84.31 279448.084560 0 0 180 1.234760e+07 100504.575864 1 27.737226 13.734072 126530.000000 0 0 0 1.188781e+07
2 1.0 231 81 2 0.624276 -4.057794 0 5 4.670211e+06 0.000000 1.179716e+06 0.72 22878.0 8.0 50.89 0.000000 2 3 180 2.994184e+06 5947.630536 1 25.961538 15.814992 181918.666667 0 0 0 3.929335e+06
3 1.0 0 91 4 2.497104 -0.624276 0 1 1.313098e+07 63659.280762 6.770848e+06 0.60 65741.0 5.0 84.63 63659.280762 3 8 180 4.295120e+04 60322.180776 1 100.000000 12.069336 543827.333333 0 0 0 1.462863e+07
4 1.0 215 89 2 0.624276 -4.057794 0 5 1.048923e+07 0.000000 1.202181e+05 0.83 54433.0 4.0 30.94 0.000000 0 3 180 7.200653e+06 27732.787464 1 29.276316 15.190716 54433.000000 0 0 0 7.149063e+06

Train Test Split

In [175]:
df4=df_2.copy()
In [176]:
from sklearn.model_selection import train_test_split
In [177]:
x=df4.drop(['FORECLOSURE'],axis=1)
In [178]:
y=df4.pop('FORECLOSURE')
In [179]:
x_new=x.copy()
y_new=y.copy()
In [180]:
x_train_new,x_test_new,y_train_new,y_test_new=train_test_split(x_new,y_new,test_size=0.3,random_state=0)
In [181]:
x_new_copy=pd.DataFrame()
x_new_copy['LAP']=x_train_new['LAP']
x_new_copy['STHL']=x_train_new['STHL']
x_new_copy['STLAP']=x_train_new['STLAP']
In [182]:
x_new_copy.head(2)
Out[182]:
LAP STHL STLAP
16187 0 0 1
11072 1 0 0
In [183]:
x_new_copy.reset_index(drop=True,inplace=True)

Z score Transformation

In [184]:
from scipy.stats import zscore
In [185]:
from sklearn.preprocessing import StandardScaler
In [186]:
x_train_new.drop(['const','LAP','STHL','STLAP'],axis=1,inplace=True)
In [187]:
x_test_new.drop(['const'],axis=1,inplace=True)
In [188]:
x_train_new.shape
Out[188]:
(13643, 24)
In [189]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
In [190]:
x_train_new=pd.DataFrame(ss.fit_transform(x_train_new),columns = x_train_new.columns)
In [191]:
x_train_new.head()
Out[191]:
BALANCE_TENURE COMPLETED_TENURE CURRENT_INTEREST_RATE_CHANGES DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DPD DUEDAY EMI_DUEAMT EMI_OS_AMOUNT EXCESS_ADJUSTED_AMT FOIR LAST_RECEIPT_AMOUNT LATEST_TRANSACTION_MONTH NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PRE_EMI_DUEAMT Percentage_Completion Mean_Interest_Rate MEAN_EMI_AMOUNT PAID_AMOUNT
0 0.018050 -0.705212 -0.676893 -0.622898 0.450196 -0.118447 -0.287603 -0.262438 -0.065941 -0.107284 -0.122503 -0.075827 0.463596 -1.732552 0.013175 -0.764159 -0.319120 -0.070397 -0.329322 -0.199913 -0.577410 1.737300 -0.133733 -0.260036
1 0.127450 -0.523551 0.200694 -0.111365 -0.117511 -0.118447 -0.287603 -0.261256 -0.065941 -0.107284 -0.438120 -0.080292 -0.604188 -1.386642 0.035201 -0.382359 -0.705400 -0.070397 -0.345788 -0.173526 -0.476625 -0.237325 -0.142860 -0.258269
2 0.705704 0.324202 -0.676893 -0.622898 0.450196 -0.118447 -0.287603 -0.231709 -0.065941 0.136986 -0.309004 -0.083243 0.463596 -0.038967 0.035201 0.381240 0.067159 1.272128 -0.373014 -0.198362 -0.131458 0.126204 -0.143385 -0.151162
3 -0.091349 -0.281336 -0.676893 -0.622898 0.450196 -0.118447 -0.287603 -0.175345 -0.065941 -0.107284 0.207461 -0.059544 -0.604188 0.053780 0.035201 -0.382359 -0.319120 -0.070397 -0.192704 -0.181246 -0.298152 1.365509 -0.108466 -0.186443
4 -0.216377 0.203095 -0.676893 -0.622898 0.450196 -0.118447 -0.287603 -0.161634 -0.065941 -0.107284 -0.265965 -0.069028 0.463596 1.138356 0.035201 -0.382359 -0.705400 -0.070397 -0.262827 -0.104943 0.020999 0.745857 -0.124190 -0.172700
In [192]:
x_test_new.head()
Out[192]:
BALANCE_TENURE COMPLETED_TENURE CURRENT_INTEREST_RATE_CHANGES DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DPD DUEDAY EMI_DUEAMT EMI_OS_AMOUNT EXCESS_ADJUSTED_AMT FOIR LAST_RECEIPT_AMOUNT LATEST_TRANSACTION_MONTH NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PRE_EMI_DUEAMT Percentage_Completion Mean_Interest_Rate MEAN_EMI_AMOUNT LAP STHL STLAP PAID_AMOUNT
185 170 28 0 0.000000 -0.936414 0 15 1.999455e+06 0.0 0.0 1.00 32333.00 5.0 41.42 0.000000 0 1 180 3.669988e+06 24302.906208 14.141414 16.855452 32333.000000 1 0 0 1.375679e+06
17908 207 3 1 1.248552 -1.248552 0 5 1.958014e+05 0.0 0.0 0.41 29552.00 12.0 61.11 0.000000 2 0 180 4.105786e+06 8436.668640 1.428571 13.317888 20701.333333 1 0 0 1.388338e+05
51 239 50 0 0.000000 -3.433518 0 15 7.147084e+06 0.0 0.0 0.80 398.41 6.0 60.39 0.000000 5 3 176 7.779062e+06 27063.596208 17.301038 18.104004 44513.690000 1 0 0 4.876733e+06
15694 175 5 0 0.000000 0.000000 0 5 2.551098e+05 0.0 0.0 0.34 23102.00 11.0 37.83 -260.609136 2 3 180 2.378284e+06 10757.856792 2.777778 19.602266 17025.000000 0 0 1 1.807275e+05
17451 236 4 0 0.000000 0.000000 0 5 8.754700e+04 0.0 0.0 0.49 9910.00 12.0 51.96 -2358.733536 3 2 240 1.345620e+06 3312.828000 1.666667 14.982624 7503.333333 0 1 0 6.176331e+04
In [193]:
f,ax = plt.subplots(2,3,figsize=(18,10),squeeze=True)
sns.distplot(x_train_new['BALANCE_TENURE'],ax=ax[0,0],kde=True)
sns.distplot(x_train_new['DPD'],ax=ax[0,1])
sns.distplot(x_train_new['COMPLETED_TENURE'],ax=ax[0,2])
sns.distplot(x_train_new['CURRENT_INTEREST_RATE_CHANGES'],ax=ax[1,0])
sns.distplot(x_train_new['DIFF_CURRENT_INTEREST_RATE_MAX_MIN'],ax=ax[1,1])
sns.distplot(x_train_new['DIFF_ORIGINAL_CURRENT_INTEREST_RATE'],ax=ax[1,2])
Out[193]:
<matplotlib.axes._subplots.AxesSubplot at 0x7da277c320>
In [194]:
f,ax = plt.subplots(2,3,figsize=(18,10),squeeze=True)
sns.distplot(x_train_new['DUEDAY'],ax=ax[0,0],kde=True)
sns.distplot(x_train_new['EMI_DUEAMT'],ax=ax[0,1])
sns.distplot(x_train_new['EMI_OS_AMOUNT'],ax=ax[0,2])
sns.distplot(x_train_new['EXCESS_ADJUSTED_AMT'],ax=ax[1,0])
sns.distplot(x_train_new['FOIR'],ax=ax[1,1])
sns.distplot(x_train_new['LAST_RECEIPT_AMOUNT'],ax=ax[1,2])
Out[194]:
<matplotlib.axes._subplots.AxesSubplot at 0x7da5e25a58>
In [195]:
f,ax = plt.subplots(2,3,figsize=(18,10),squeeze=True)
sns.distplot(df4['LATEST_TRANSACTION_MONTH'],ax=ax[0,0],kde=True)
sns.distplot(df4['NET_LTV'],ax=ax[0,1])
sns.distplot(df4['NET_RECEIVABLE'],ax=ax[0,2])
sns.distplot(df4['NUM_EMI_CHANGES'],ax=ax[1,0])
sns.distplot(df4['NUM_LOW_FREQ_TRANSACTIONS'],ax=ax[1,1])
sns.distplot(df4['ORIGNAL_TENOR'],ax=ax[1,2])
Out[195]:
<matplotlib.axes._subplots.AxesSubplot at 0x7da6621a20>
In [196]:
f,ax = plt.subplots(2,3,figsize=(18,10),squeeze=True)
sns.distplot(x_train_new['OUTSTANDING_PRINCIPAL'],ax=ax[0,0],kde=True)
sns.distplot(x_train_new['PRE_EMI_DUEAMT'],ax=ax[0,1])
sns.distplot(x_train_new['Percentage_Completion'],ax=ax[0,2])
sns.distplot(x_train_new['Mean_Interest_Rate'],ax=ax[1,0])
sns.distplot(x_train_new['MEAN_EMI_AMOUNT'],ax=ax[1,1])
sns.distplot(x_train_new['PAID_AMOUNT'],ax=ax[1,2])
Out[196]:
<matplotlib.axes._subplots.AxesSubplot at 0x7da6d929b0>
In [197]:
for col in x_train_new.columns:
    print("{} column has {} records having zscore greater than 3".format(col,len(x_train_new[x_train_new[col]>3])))
    print("{} column has {} records having zscore lesser than -3".format(col,len(x_train_new[x_train_new[col]<-3]))) 
    print("")
BALANCE_TENURE column has 137 records having zscore greater than 3
BALANCE_TENURE column has 0 records having zscore lesser than -3

COMPLETED_TENURE column has 242 records having zscore greater than 3
COMPLETED_TENURE column has 0 records having zscore lesser than -3

CURRENT_INTEREST_RATE_CHANGES column has 172 records having zscore greater than 3
CURRENT_INTEREST_RATE_CHANGES column has 0 records having zscore lesser than -3

DIFF_CURRENT_INTEREST_RATE_MAX_MIN column has 143 records having zscore greater than 3
DIFF_CURRENT_INTEREST_RATE_MAX_MIN column has 0 records having zscore lesser than -3

DIFF_ORIGINAL_CURRENT_INTEREST_RATE column has 135 records having zscore greater than 3
DIFF_ORIGINAL_CURRENT_INTEREST_RATE column has 34 records having zscore lesser than -3

DPD column has 103 records having zscore greater than 3
DPD column has 0 records having zscore lesser than -3

DUEDAY column has 1097 records having zscore greater than 3
DUEDAY column has 0 records having zscore lesser than -3

EMI_DUEAMT column has 159 records having zscore greater than 3
EMI_DUEAMT column has 0 records having zscore lesser than -3

EMI_OS_AMOUNT column has 62 records having zscore greater than 3
EMI_OS_AMOUNT column has 0 records having zscore lesser than -3

EXCESS_ADJUSTED_AMT column has 90 records having zscore greater than 3
EXCESS_ADJUSTED_AMT column has 0 records having zscore lesser than -3

FOIR column has 76 records having zscore greater than 3
FOIR column has 0 records having zscore lesser than -3

LAST_RECEIPT_AMOUNT column has 43 records having zscore greater than 3
LAST_RECEIPT_AMOUNT column has 0 records having zscore lesser than -3

LATEST_TRANSACTION_MONTH column has 0 records having zscore greater than 3
LATEST_TRANSACTION_MONTH column has 487 records having zscore lesser than -3

NET_LTV column has 0 records having zscore greater than 3
NET_LTV column has 0 records having zscore lesser than -3

NET_RECEIVABLE column has 11 records having zscore greater than 3
NET_RECEIVABLE column has 49 records having zscore lesser than -3

NUM_EMI_CHANGES column has 330 records having zscore greater than 3
NUM_EMI_CHANGES column has 0 records having zscore lesser than -3

NUM_LOW_FREQ_TRANSACTIONS column has 359 records having zscore greater than 3
NUM_LOW_FREQ_TRANSACTIONS column has 0 records having zscore lesser than -3

ORIGNAL_TENOR column has 0 records having zscore greater than 3
ORIGNAL_TENOR column has 36 records having zscore lesser than -3

OUTSTANDING_PRINCIPAL column has 187 records having zscore greater than 3
OUTSTANDING_PRINCIPAL column has 0 records having zscore lesser than -3

PRE_EMI_DUEAMT column has 116 records having zscore greater than 3
PRE_EMI_DUEAMT column has 0 records having zscore lesser than -3

Percentage_Completion column has 283 records having zscore greater than 3
Percentage_Completion column has 0 records having zscore lesser than -3

Mean_Interest_Rate column has 4 records having zscore greater than 3
Mean_Interest_Rate column has 1 records having zscore lesser than -3

MEAN_EMI_AMOUNT column has 71 records having zscore greater than 3
MEAN_EMI_AMOUNT column has 0 records having zscore lesser than -3

PAID_AMOUNT column has 148 records having zscore greater than 3
PAID_AMOUNT column has 0 records having zscore lesser than -3

In [198]:
for col in x_train_new.columns:
    x_train_new[col]=np.where(x_train_new[col] >= 3,3,x_train_new[col])
    x_train_new[col]=np.where(x_train_new[col] <= -3,-3,x_train_new[col])
In [199]:
for col in x_train_new.columns:
    print("{} column has {} records having zscore greater than 3".format(col,len(x_train_new[x_train_new[col]>3])))
    print("{} column has {} records having zscore lesser than -3".format(col,len(x_train_new[x_train_new[col]<-3]))) 
    print("")
BALANCE_TENURE column has 0 records having zscore greater than 3
BALANCE_TENURE column has 0 records having zscore lesser than -3

COMPLETED_TENURE column has 0 records having zscore greater than 3
COMPLETED_TENURE column has 0 records having zscore lesser than -3

CURRENT_INTEREST_RATE_CHANGES column has 0 records having zscore greater than 3
CURRENT_INTEREST_RATE_CHANGES column has 0 records having zscore lesser than -3

DIFF_CURRENT_INTEREST_RATE_MAX_MIN column has 0 records having zscore greater than 3
DIFF_CURRENT_INTEREST_RATE_MAX_MIN column has 0 records having zscore lesser than -3

DIFF_ORIGINAL_CURRENT_INTEREST_RATE column has 0 records having zscore greater than 3
DIFF_ORIGINAL_CURRENT_INTEREST_RATE column has 0 records having zscore lesser than -3

DPD column has 0 records having zscore greater than 3
DPD column has 0 records having zscore lesser than -3

DUEDAY column has 0 records having zscore greater than 3
DUEDAY column has 0 records having zscore lesser than -3

EMI_DUEAMT column has 0 records having zscore greater than 3
EMI_DUEAMT column has 0 records having zscore lesser than -3

EMI_OS_AMOUNT column has 0 records having zscore greater than 3
EMI_OS_AMOUNT column has 0 records having zscore lesser than -3

EXCESS_ADJUSTED_AMT column has 0 records having zscore greater than 3
EXCESS_ADJUSTED_AMT column has 0 records having zscore lesser than -3

FOIR column has 0 records having zscore greater than 3
FOIR column has 0 records having zscore lesser than -3

LAST_RECEIPT_AMOUNT column has 0 records having zscore greater than 3
LAST_RECEIPT_AMOUNT column has 0 records having zscore lesser than -3

LATEST_TRANSACTION_MONTH column has 0 records having zscore greater than 3
LATEST_TRANSACTION_MONTH column has 0 records having zscore lesser than -3

NET_LTV column has 0 records having zscore greater than 3
NET_LTV column has 0 records having zscore lesser than -3

NET_RECEIVABLE column has 0 records having zscore greater than 3
NET_RECEIVABLE column has 0 records having zscore lesser than -3

NUM_EMI_CHANGES column has 0 records having zscore greater than 3
NUM_EMI_CHANGES column has 0 records having zscore lesser than -3

NUM_LOW_FREQ_TRANSACTIONS column has 0 records having zscore greater than 3
NUM_LOW_FREQ_TRANSACTIONS column has 0 records having zscore lesser than -3

ORIGNAL_TENOR column has 0 records having zscore greater than 3
ORIGNAL_TENOR column has 0 records having zscore lesser than -3

OUTSTANDING_PRINCIPAL column has 0 records having zscore greater than 3
OUTSTANDING_PRINCIPAL column has 0 records having zscore lesser than -3

PRE_EMI_DUEAMT column has 0 records having zscore greater than 3
PRE_EMI_DUEAMT column has 0 records having zscore lesser than -3

Percentage_Completion column has 0 records having zscore greater than 3
Percentage_Completion column has 0 records having zscore lesser than -3

Mean_Interest_Rate column has 0 records having zscore greater than 3
Mean_Interest_Rate column has 0 records having zscore lesser than -3

MEAN_EMI_AMOUNT column has 0 records having zscore greater than 3
MEAN_EMI_AMOUNT column has 0 records having zscore lesser than -3

PAID_AMOUNT column has 0 records having zscore greater than 3
PAID_AMOUNT column has 0 records having zscore lesser than -3

In [200]:
f,ax = plt.subplots(2,3,figsize=(18,10),squeeze=True)
sns.distplot(x_train_new['BALANCE_TENURE'],ax=ax[0,0],kde=True)
sns.distplot(x_train_new['DPD'],ax=ax[0,1])
sns.distplot(x_train_new['COMPLETED_TENURE'],ax=ax[0,2])
sns.distplot(x_train_new['CURRENT_INTEREST_RATE_CHANGES'],ax=ax[1,0])
sns.distplot(x_train_new['DIFF_CURRENT_INTEREST_RATE_MAX_MIN'],ax=ax[1,1])
sns.distplot(x_train_new['DIFF_ORIGINAL_CURRENT_INTEREST_RATE'],ax=ax[1,2])
Out[200]:
<matplotlib.axes._subplots.AxesSubplot at 0x7da759d860>
In [201]:
f,ax = plt.subplots(2,3,figsize=(18,10),squeeze=True)
sns.distplot(x_train_new['DUEDAY'],ax=ax[0,0],kde=True)
sns.distplot(x_train_new['EMI_DUEAMT'],ax=ax[0,1])
sns.distplot(x_train_new['EMI_OS_AMOUNT'],ax=ax[0,2])
sns.distplot(x_train_new['EXCESS_ADJUSTED_AMT'],ax=ax[1,0])
sns.distplot(x_train_new['FOIR'],ax=ax[1,1])
sns.distplot(x_train_new['LAST_RECEIPT_AMOUNT'],ax=ax[1,2])
Out[201]:
<matplotlib.axes._subplots.AxesSubplot at 0x7da8cbe0f0>
In [202]:
f,ax = plt.subplots(2,3,figsize=(18,10),squeeze=True)
sns.distplot(x_train_new['LATEST_TRANSACTION_MONTH'],ax=ax[0,0],kde=True)
sns.distplot(x_train_new['NET_LTV'],ax=ax[0,1])
sns.distplot(x_train_new['NET_RECEIVABLE'],ax=ax[0,2])
sns.distplot(x_train_new['NUM_EMI_CHANGES'],ax=ax[1,0])
sns.distplot(x_train_new['NUM_LOW_FREQ_TRANSACTIONS'],ax=ax[1,1])
sns.distplot(x_train_new['ORIGNAL_TENOR'],ax=ax[1,2])
Out[202]:
<matplotlib.axes._subplots.AxesSubplot at 0x7da94b04a8>
In [203]:
f,ax = plt.subplots(2,3,figsize=(18,10),squeeze=True)
sns.distplot(x_train_new['OUTSTANDING_PRINCIPAL'],ax=ax[0,0],kde=True)
sns.distplot(x_train_new['PRE_EMI_DUEAMT'],ax=ax[0,1])
sns.distplot(x_train_new['Percentage_Completion'],ax=ax[0,2])
sns.distplot(x_train_new['Mean_Interest_Rate'],ax=ax[1,0])
sns.distplot(x_train_new['MEAN_EMI_AMOUNT'],ax=ax[1,1])
sns.distplot(x_train_new['PAID_AMOUNT'],ax=ax[1,2])
Out[203]:
<matplotlib.axes._subplots.AxesSubplot at 0x7da9c63898>
In [204]:
x_train_new.head()
Out[204]:
BALANCE_TENURE COMPLETED_TENURE CURRENT_INTEREST_RATE_CHANGES DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DPD DUEDAY EMI_DUEAMT EMI_OS_AMOUNT EXCESS_ADJUSTED_AMT FOIR LAST_RECEIPT_AMOUNT LATEST_TRANSACTION_MONTH NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PRE_EMI_DUEAMT Percentage_Completion Mean_Interest_Rate MEAN_EMI_AMOUNT PAID_AMOUNT
0 0.018050 -0.705212 -0.676893 -0.622898 0.450196 -0.118447 -0.287603 -0.262438 -0.065941 -0.107284 -0.122503 -0.075827 0.463596 -1.732552 0.013175 -0.764159 -0.319120 -0.070397 -0.329322 -0.199913 -0.577410 1.737300 -0.133733 -0.260036
1 0.127450 -0.523551 0.200694 -0.111365 -0.117511 -0.118447 -0.287603 -0.261256 -0.065941 -0.107284 -0.438120 -0.080292 -0.604188 -1.386642 0.035201 -0.382359 -0.705400 -0.070397 -0.345788 -0.173526 -0.476625 -0.237325 -0.142860 -0.258269
2 0.705704 0.324202 -0.676893 -0.622898 0.450196 -0.118447 -0.287603 -0.231709 -0.065941 0.136986 -0.309004 -0.083243 0.463596 -0.038967 0.035201 0.381240 0.067159 1.272128 -0.373014 -0.198362 -0.131458 0.126204 -0.143385 -0.151162
3 -0.091349 -0.281336 -0.676893 -0.622898 0.450196 -0.118447 -0.287603 -0.175345 -0.065941 -0.107284 0.207461 -0.059544 -0.604188 0.053780 0.035201 -0.382359 -0.319120 -0.070397 -0.192704 -0.181246 -0.298152 1.365509 -0.108466 -0.186443
4 -0.216377 0.203095 -0.676893 -0.622898 0.450196 -0.118447 -0.287603 -0.161634 -0.065941 -0.107284 -0.265965 -0.069028 0.463596 1.138356 0.035201 -0.382359 -0.705400 -0.070397 -0.262827 -0.104943 0.020999 0.745857 -0.124190 -0.172700

Z scaling Test Data

In [205]:
df_test=x_test_new.copy()
In [206]:
df_test.head(2)
Out[206]:
BALANCE_TENURE COMPLETED_TENURE CURRENT_INTEREST_RATE_CHANGES DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DPD DUEDAY EMI_DUEAMT EMI_OS_AMOUNT EXCESS_ADJUSTED_AMT FOIR LAST_RECEIPT_AMOUNT LATEST_TRANSACTION_MONTH NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PRE_EMI_DUEAMT Percentage_Completion Mean_Interest_Rate MEAN_EMI_AMOUNT LAP STHL STLAP PAID_AMOUNT
185 170 28 0 0.000000 -0.936414 0 15 1.999455e+06 0.0 0.0 1.00 32333.0 5.0 41.42 0.0 0 1 180 3.669988e+06 24302.906208 14.141414 16.855452 32333.000000 1 0 0 1.375679e+06
17908 207 3 1 1.248552 -1.248552 0 5 1.958014e+05 0.0 0.0 0.41 29552.0 12.0 61.11 0.0 2 0 180 4.105786e+06 8436.668640 1.428571 13.317888 20701.333333 1 0 0 1.388338e+05
In [207]:
df_test1=pd.DataFrame()
df_test1['LAP']=x_test_new['LAP']
df_test1['STHL']=x_test_new['STHL']
df_test1['STLAP']=x_test_new['STLAP']
In [208]:
df_test1.head(2)
Out[208]:
LAP STHL STLAP
185 1 0 0
17908 1 0 0
In [209]:
df_test1.reset_index(drop=True,inplace=True)
In [210]:
x_test_new.drop(['LAP','STHL','STLAP'],axis=1,inplace=True)
In [211]:
x_test_new=pd.DataFrame(ss.transform(x_test_new),columns = x_test_new.columns)
In [212]:
x_test_new.head(10)
Out[212]:
BALANCE_TENURE COMPLETED_TENURE CURRENT_INTEREST_RATE_CHANGES DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DPD DUEDAY EMI_DUEAMT EMI_OS_AMOUNT EXCESS_ADJUSTED_AMT FOIR LAST_RECEIPT_AMOUNT LATEST_TRANSACTION_MONTH NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PRE_EMI_DUEAMT Percentage_Completion Mean_Interest_Rate MEAN_EMI_AMOUNT PAID_AMOUNT
0 -0.044464 0.626971 -0.676893 -0.622898 -0.614255 -0.118447 3.366490 -0.002305 -0.065941 -0.107284 0.566118 -0.053824 -2.027899 -0.450178 0.035201 -1.145959 -0.705400 -0.070397 -0.128890 -0.118697 0.198709 0.869787 -0.068570 -0.039478
1 0.533790 -0.886874 0.200694 0.655934 -0.969073 -0.118447 -0.287603 -0.259112 -0.065941 -0.107284 -0.280311 -0.056991 0.463596 0.481554 0.035201 -0.382359 -1.091680 -0.070397 -0.090971 -0.183111 -0.714189 -0.534758 -0.101512 -0.256752
2 1.033902 1.959155 -0.676893 -0.622898 -3.452793 -0.118447 3.366490 0.730623 -0.065941 -0.107284 0.279193 -0.090194 -1.671972 0.447483 0.035201 0.763040 0.067159 -0.159898 0.228645 -0.107490 0.425598 1.365509 -0.034073 0.575544
3 0.033679 -0.765766 -0.676893 -0.622898 0.450196 -0.118447 -0.287603 -0.250668 -0.065941 -0.107284 -0.380735 -0.064337 0.107668 -0.620057 0.035001 -0.382359 0.067159 -0.070397 -0.241283 -0.173687 -0.617304 1.960375 -0.111923 -0.249393
4 0.987017 -0.826320 -0.676893 -0.622898 0.450196 -0.118447 -0.287603 -0.274526 -0.065941 -0.107284 -0.165541 -0.079361 0.463596 0.048575 0.033393 -0.000560 -0.319120 1.272128 -0.331136 -0.203912 -0.697091 0.126204 -0.138889 -0.270291
5 0.924503 -0.584105 -0.676893 -0.622898 0.450196 -0.118447 -0.287603 -0.251334 -0.065941 -0.107193 -0.079464 -0.074505 0.463596 1.724652 0.035201 -0.382359 -0.319120 1.272128 -0.235549 -0.200182 -0.577410 -1.678224 -0.131569 -0.250731
6 0.049307 -0.826320 -0.676893 -0.622898 0.450196 -0.118447 -0.287603 -0.245863 -0.065941 -0.107284 0.049653 -0.053410 0.463596 0.542597 0.035001 -0.000560 -0.319120 -0.070397 -0.107927 -0.175839 -0.657197 0.473209 -0.098294 -0.245427
7 -0.060092 -0.402443 -0.676893 -0.622898 0.450196 -0.118447 -0.287603 -0.209529 -0.065941 -0.107284 -0.409428 -0.065144 0.463596 -0.306325 0.035201 -0.764159 -0.319120 -0.070397 -0.242875 -0.187468 -0.377940 1.613370 -0.117747 -0.215296
8 -2.044911 1.535278 1.078282 0.016518 1.159830 -0.118447 -0.287603 0.473887 -0.065941 0.513220 -0.222926 -0.026561 -3.095683 -0.835364 0.035201 -0.382359 0.453439 -1.412921 -0.191814 -0.161404 2.815921 -0.534758 0.654095 0.569622
9 0.549419 -0.220782 1.078282 1.167467 -1.536780 -0.118447 -0.287603 -0.149803 -0.065941 -0.107193 0.178769 -0.055158 0.463596 0.667048 0.035201 -0.764159 -0.705400 -0.070397 -0.095962 -0.180143 -0.363923 -0.030774 -0.095734 -0.164990
In [213]:
x_test_new=pd.concat([x_test_new,df_test1],axis=1)
x_test_new.head(2)
Out[213]:
BALANCE_TENURE COMPLETED_TENURE CURRENT_INTEREST_RATE_CHANGES DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DPD DUEDAY EMI_DUEAMT EMI_OS_AMOUNT EXCESS_ADJUSTED_AMT FOIR LAST_RECEIPT_AMOUNT LATEST_TRANSACTION_MONTH NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PRE_EMI_DUEAMT Percentage_Completion Mean_Interest_Rate MEAN_EMI_AMOUNT PAID_AMOUNT LAP STHL STLAP
0 -0.044464 0.626971 -0.676893 -0.622898 -0.614255 -0.118447 3.366490 -0.002305 -0.065941 -0.107284 0.566118 -0.053824 -2.027899 -0.450178 0.035201 -1.145959 -0.70540 -0.070397 -0.128890 -0.118697 0.198709 0.869787 -0.068570 -0.039478 1 0 0
1 0.533790 -0.886874 0.200694 0.655934 -0.969073 -0.118447 -0.287603 -0.259112 -0.065941 -0.107284 -0.280311 -0.056991 0.463596 0.481554 0.035201 -0.382359 -1.09168 -0.070397 -0.090971 -0.183111 -0.714189 -0.534758 -0.101512 -0.256752 1 0 0
In [214]:
x_train_new= pd.concat([x_train_new,x_new_copy],axis=1)
In [215]:
x_test_new.head(2)
Out[215]:
BALANCE_TENURE COMPLETED_TENURE CURRENT_INTEREST_RATE_CHANGES DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DPD DUEDAY EMI_DUEAMT EMI_OS_AMOUNT EXCESS_ADJUSTED_AMT FOIR LAST_RECEIPT_AMOUNT LATEST_TRANSACTION_MONTH NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PRE_EMI_DUEAMT Percentage_Completion Mean_Interest_Rate MEAN_EMI_AMOUNT PAID_AMOUNT LAP STHL STLAP
0 -0.044464 0.626971 -0.676893 -0.622898 -0.614255 -0.118447 3.366490 -0.002305 -0.065941 -0.107284 0.566118 -0.053824 -2.027899 -0.450178 0.035201 -1.145959 -0.70540 -0.070397 -0.128890 -0.118697 0.198709 0.869787 -0.068570 -0.039478 1 0 0
1 0.533790 -0.886874 0.200694 0.655934 -0.969073 -0.118447 -0.287603 -0.259112 -0.065941 -0.107284 -0.280311 -0.056991 0.463596 0.481554 0.035201 -0.382359 -1.09168 -0.070397 -0.090971 -0.183111 -0.714189 -0.534758 -0.101512 -0.256752 1 0 0
In [216]:
x_train_new.head(2)
Out[216]:
BALANCE_TENURE COMPLETED_TENURE CURRENT_INTEREST_RATE_CHANGES DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DPD DUEDAY EMI_DUEAMT EMI_OS_AMOUNT EXCESS_ADJUSTED_AMT FOIR LAST_RECEIPT_AMOUNT LATEST_TRANSACTION_MONTH NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PRE_EMI_DUEAMT Percentage_Completion Mean_Interest_Rate MEAN_EMI_AMOUNT PAID_AMOUNT LAP STHL STLAP
0 0.01805 -0.705212 -0.676893 -0.622898 0.450196 -0.118447 -0.287603 -0.262438 -0.065941 -0.107284 -0.122503 -0.075827 0.463596 -1.732552 0.013175 -0.764159 -0.31912 -0.070397 -0.329322 -0.199913 -0.577410 1.737300 -0.133733 -0.260036 0 0 1
1 0.12745 -0.523551 0.200694 -0.111365 -0.117511 -0.118447 -0.287603 -0.261256 -0.065941 -0.107284 -0.438120 -0.080292 -0.604188 -1.386642 0.035201 -0.382359 -0.70540 -0.070397 -0.345788 -0.173526 -0.476625 -0.237325 -0.142860 -0.258269 1 0 0

Clustering Analysis

KMEANS

In [217]:
from sklearn.cluster import KMeans
In [218]:
df_clusters=df_2.copy()
In [219]:
df_clusters.head()
Out[219]:
const BALANCE_TENURE COMPLETED_TENURE CURRENT_INTEREST_RATE_CHANGES DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DPD DUEDAY EMI_DUEAMT EMI_OS_AMOUNT EXCESS_ADJUSTED_AMT FOIR LAST_RECEIPT_AMOUNT LATEST_TRANSACTION_MONTH NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PRE_EMI_DUEAMT FORECLOSURE Percentage_Completion Mean_Interest_Rate MEAN_EMI_AMOUNT LAP STHL STLAP PAID_AMOUNT
0 1.0 0 45 1 0.312138 -2.809242 0 1 8.614898e+06 175477.782641 2.135869e+05 0.60 1000000.0 5.0 40.06 175477.782641 1 1 120 1.164472e+05 7994.273589 1 100.000000 13.525980 389039.333333 0 0 0 1.431938e+07
1 1.0 99 38 0 0.000000 -3.121380 0 1 1.061903e+07 279448.084560 0.000000e+00 0.60 126530.0 11.0 84.31 279448.084560 0 0 180 1.234760e+07 100504.575864 1 27.737226 13.734072 126530.000000 0 0 0 1.188781e+07
2 1.0 231 81 2 0.624276 -4.057794 0 5 4.670211e+06 0.000000 1.179716e+06 0.72 22878.0 8.0 50.89 0.000000 2 3 180 2.994184e+06 5947.630536 1 25.961538 15.814992 181918.666667 0 0 0 3.929335e+06
3 1.0 0 91 4 2.497104 -0.624276 0 1 1.313098e+07 63659.280762 6.770848e+06 0.60 65741.0 5.0 84.63 63659.280762 3 8 180 4.295120e+04 60322.180776 1 100.000000 12.069336 543827.333333 0 0 0 1.462863e+07
4 1.0 215 89 2 0.624276 -4.057794 0 5 1.048923e+07 0.000000 1.202181e+05 0.83 54433.0 4.0 30.94 0.000000 0 3 180 7.200653e+06 27732.787464 1 29.276316 15.190716 54433.000000 0 0 0 7.149063e+06
In [220]:
df_clusters.drop(['const'],axis=1,inplace=True)
In [221]:
nbfc_arr=np.array(df_clusters)
In [222]:
wss=[]
for i in range(1,10):
    KM = KMeans(n_clusters=i)
    KM.fit(nbfc_arr)
    wss.append(KM.inertia_)
In [223]:
plt.plot(range(1,10), wss)
Out[223]:
[<matplotlib.lines.Line2D at 0x7daaec5fd0>]
In [224]:
k_means=KMeans(n_clusters=3)
k_means.fit(df_clusters)
labels=k_means.labels_
# Taking 3 clusters to take into account.
In [225]:
from sklearn.metrics import silhouette_samples,silhouette_score
In [226]:
silhouette_score(df_clusters,labels)
Out[226]:
0.9179579798100705
In [227]:
df_clusters['silhouette_samples']=silhouette_samples(df_clusters,labels)
In [228]:
df_clusters['Kmeans_Cluster'] = labels
In [229]:
df_clusters.head()
Out[229]:
BALANCE_TENURE COMPLETED_TENURE CURRENT_INTEREST_RATE_CHANGES DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DPD DUEDAY EMI_DUEAMT EMI_OS_AMOUNT EXCESS_ADJUSTED_AMT FOIR LAST_RECEIPT_AMOUNT LATEST_TRANSACTION_MONTH NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PRE_EMI_DUEAMT FORECLOSURE Percentage_Completion Mean_Interest_Rate MEAN_EMI_AMOUNT LAP STHL STLAP PAID_AMOUNT silhouette_samples Kmeans_Cluster
0 0 45 1 0.312138 -2.809242 0 1 8.614898e+06 175477.782641 2.135869e+05 0.60 1000000.0 5.0 40.06 175477.782641 1 1 120 1.164472e+05 7994.273589 1 100.000000 13.525980 389039.333333 0 0 0 1.431938e+07 0.815032 0
1 99 38 0 0.000000 -3.121380 0 1 1.061903e+07 279448.084560 0.000000e+00 0.60 126530.0 11.0 84.31 279448.084560 0 0 180 1.234760e+07 100504.575864 1 27.737226 13.734072 126530.000000 0 0 0 1.188781e+07 0.783450 0
2 231 81 2 0.624276 -4.057794 0 5 4.670211e+06 0.000000 1.179716e+06 0.72 22878.0 8.0 50.89 0.000000 2 3 180 2.994184e+06 5947.630536 1 25.961538 15.814992 181918.666667 0 0 0 3.929335e+06 0.924518 0
3 0 91 4 2.497104 -0.624276 0 1 1.313098e+07 63659.280762 6.770848e+06 0.60 65741.0 5.0 84.63 63659.280762 3 8 180 4.295120e+04 60322.180776 1 100.000000 12.069336 543827.333333 0 0 0 1.462863e+07 0.770884 0
4 215 89 2 0.624276 -4.057794 0 5 1.048923e+07 0.000000 1.202181e+05 0.83 54433.0 4.0 30.94 0.000000 0 3 180 7.200653e+06 27732.787464 1 29.276316 15.190716 54433.000000 0 0 0 7.149063e+06 0.849797 0
In [230]:
df_clusters['Kmeans_Cluster'].value_counts()
Out[230]:
0    19216
2      273
1        1
Name: Kmeans_Cluster, dtype: int64
In [231]:
df_clusters['Customer_ID'] = df_1['CUSTOMERID']
In [232]:
df_clusters[df_clusters['Kmeans_Cluster'] == 0].describe()
Out[232]:
BALANCE_TENURE COMPLETED_TENURE CURRENT_INTEREST_RATE_CHANGES DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DPD DUEDAY EMI_DUEAMT EMI_OS_AMOUNT EXCESS_ADJUSTED_AMT FOIR LAST_RECEIPT_AMOUNT LATEST_TRANSACTION_MONTH NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PRE_EMI_DUEAMT FORECLOSURE Percentage_Completion Mean_Interest_Rate MEAN_EMI_AMOUNT LAP STHL STLAP PAID_AMOUNT silhouette_samples Kmeans_Cluster Customer_ID
count 19216.000000 19216.000000 19216.000000 19216.000000 19216.000000 19216.000000 19216.000000 1.921600e+04 1.921600e+04 1.921600e+04 19216.000000 1.921600e+04 19216.000000 19216.000000 1.921600e+04 19216.000000 19216.000000 19216.000000 1.921600e+04 1.921600e+04 19216.000000 19216.000000 19216.000000 1.921600e+04 19216.000000 19216.000000 19216.000000 1.921600e+04 19216.000000 19216.0 1.921600e+04
mean 173.347419 17.416164 0.762438 0.598675 -0.386531 6.764675 5.756141 1.498578e+06 2.201661e+04 2.458336e+05 0.598874 6.520614e+04 10.728768 50.951920 -4.510670e+04 3.004319 2.792361 183.514311 4.269460e+06 4.574747e+04 0.089665 11.211818 14.655713 4.661139e+04 0.305579 0.363083 0.156484 1.208504e+06 0.930268 0.0 1.201752e+07
std 63.514866 16.283820 1.132826 0.937395 0.873130 57.807013 2.687596 3.079201e+06 3.105946e+05 1.827168e+06 0.637246 4.596975e+05 2.786107 21.124902 1.172609e+06 2.620230 2.535714 44.214875 5.651187e+06 1.980600e+05 0.285708 13.783543 2.521077 2.186626e+05 0.460664 0.480901 0.363323 2.763049e+06 0.075837 0.0 9.464887e+03
min 0.000000 0.000000 0.000000 0.000000 -7.179174 0.000000 1.000000 0.000000e+00 0.000000e+00 0.000000e+00 0.000000 1.000000e+00 1.000000 0.380000 -7.482920e+07 0.000000 0.000000 14.000000 -7.506479e-01 0.000000e+00 0.000000 0.000000 6.958596 1.334000e+01 0.000000 0.000000 0.000000 0.000000e+00 0.133395 0.0 1.200100e+07
25% 137.000000 6.000000 0.000000 0.000000 -1.186124 0.000000 5.000000 2.155171e+05 0.000000e+00 0.000000e+00 0.410000 1.107325e+04 12.000000 34.880000 -2.208552e+01 2.000000 1.000000 180.000000 1.423297e+06 4.909059e+03 0.000000 2.916667 12.797658 8.818833e+03 0.000000 0.000000 0.000000 1.681215e+05 0.943525 0.0 1.200939e+07
50% 174.000000 12.000000 0.000000 0.000000 0.000000 0.000000 5.000000 5.484022e+05 0.000000e+00 0.000000e+00 0.520000 1.952150e+04 12.000000 52.895000 0.000000e+00 2.000000 2.000000 180.000000 2.368055e+06 1.073798e+04 0.000000 6.250000 14.275111 1.533235e+04 0.000000 0.000000 0.000000 4.423365e+05 0.954771 0.0 1.201753e+07
75% 216.000000 25.000000 2.000000 1.186124 0.000000 0.000000 5.000000 1.432404e+06 0.000000e+00 2.606091e+02 0.680000 3.731075e+04 12.000000 66.620000 0.000000e+00 4.000000 3.000000 228.000000 4.439330e+06 3.095783e+04 0.000000 14.583333 16.231176 3.187600e+04 1.000000 1.000000 0.000000 1.065342e+06 0.957552 0.0 1.202571e+07
max 674.000000 98.000000 9.000000 24.346764 8.458187 1658.000000 15.000000 6.339958e+07 2.137741e+07 8.352094e+07 33.220000 2.444927e+07 12.000000 100.000000 1.483567e+07 33.000000 30.000000 300.000000 5.254535e+07 1.023971e+07 1.000000 100.000000 24.513238 1.151328e+07 1.000000 1.000000 1.000000 7.166709e+07 0.959443 0.0 1.203390e+07
In [233]:
df_clusters[df_clusters['Kmeans_Cluster'] == 2].describe()
Out[233]:
BALANCE_TENURE COMPLETED_TENURE CURRENT_INTEREST_RATE_CHANGES DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DPD DUEDAY EMI_DUEAMT EMI_OS_AMOUNT EXCESS_ADJUSTED_AMT FOIR LAST_RECEIPT_AMOUNT LATEST_TRANSACTION_MONTH NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PRE_EMI_DUEAMT FORECLOSURE Percentage_Completion Mean_Interest_Rate MEAN_EMI_AMOUNT LAP STHL STLAP PAID_AMOUNT silhouette_samples Kmeans_Cluster Customer_ID
count 273.000000 273.000000 273.000000 273.000000 273.000000 273.000000 273.000000 2.730000e+02 2.730000e+02 2.730000e+02 273.000000 2.730000e+02 273.000000 273.000000 2.730000e+02 273.000000 273.000000 273.000000 2.730000e+02 2.730000e+02 273.000000 273.000000 273.000000 2.730000e+02 273.000000 273.0 273.0 2.730000e+02 273.000000 273.0 2.730000e+02
mean 132.996337 27.329670 1.608059 1.484771 -0.725979 23.575092 7.014652 3.857527e+07 6.923021e+05 5.211076e+06 0.903927 1.228654e+06 9.542125 53.597106 -2.402127e+05 2.890110 3.765568 149.106227 7.032995e+07 7.919371e+05 0.065934 21.207513 14.845451 7.990969e+05 0.868132 0.0 0.0 2.969082e+07 0.054834 2.0 1.201190e+07
std 69.319088 22.769976 1.407781 2.132547 1.494024 162.050654 4.018313 3.719474e+07 4.693352e+06 1.795447e+07 0.983696 5.586326e+06 3.660082 15.231350 5.726008e+06 2.399987 3.582533 48.805659 5.417356e+07 1.720176e+06 0.248623 21.317127 1.976823 1.956863e+06 0.338969 0.0 0.0 3.055447e+07 0.206399 0.0 9.347972e+03
min 0.000000 0.000000 0.000000 0.000000 -5.306346 0.000000 5.000000 0.000000e+00 0.000000e+00 0.000000e+00 0.000000 2.300000e+02 1.000000 6.740000 -7.534554e+07 0.000000 0.000000 36.000000 0.000000e+00 0.000000e+00 0.000000 0.000000 8.011542 2.300000e+02 0.000000 0.0 0.0 0.000000e+00 -0.327045 2.0 1.200103e+07
25% 81.000000 10.000000 0.000000 0.000000 -1.747973 0.000000 5.000000 1.480990e+07 0.000000e+00 0.000000e+00 0.610000 3.667090e+05 7.000000 46.440000 -1.347217e+00 1.000000 2.000000 120.000000 4.096915e+07 1.276830e+05 0.000000 6.410256 13.546789 2.999227e+05 1.000000 0.0 0.0 1.083680e+07 -0.134685 2.0 1.200281e+07
50% 130.000000 20.000000 2.000000 1.747973 -0.936414 0.000000 5.000000 2.678167e+07 0.000000e+00 2.606091e+02 0.780000 5.430140e+05 12.000000 57.080000 0.000000e+00 2.000000 3.000000 180.000000 5.446071e+07 2.836245e+05 0.000000 13.653137 14.483203 4.381870e+05 1.000000 0.0 0.0 1.933157e+07 0.089298 2.0 1.200931e+07
75% 180.000000 40.000000 2.000000 1.747973 0.000000 0.000000 5.000000 5.098856e+07 0.000000e+00 1.087738e+06 0.990000 8.498820e+05 12.000000 64.850000 0.000000e+00 4.000000 4.000000 180.000000 8.120618e+07 6.992121e+05 0.000000 30.530973 16.085512 7.202373e+05 1.000000 0.0 0.0 3.712983e+07 0.247908 2.0 1.201894e+07
max 408.000000 96.000000 7.000000 24.346764 7.881109 1790.000000 15.000000 3.546104e+08 5.899531e+07 1.867143e+08 11.850000 8.496881e+07 12.000000 85.140000 3.864350e+07 18.000000 23.000000 240.000000 3.818367e+08 1.710689e+07 1.000000 100.000000 24.138672 2.872427e+07 1.000000 0.0 0.0 2.425390e+08 0.358996 2.0 1.203389e+07
In [234]:
# Customers belongs to cluster 2 has better financials. They are those customers which has lesser Days past dues ,Due Day,
# EMI_DUEAMT ,EMI_OS_AMOUNT,FOIR,NET_LTV,OUTSTANDING_PRINCIPAL . They are the ones which has lesser probabilites to 
# get foreclosed

# Customers belongs to cluster 0 has poor financials. They are those customers which has more number of Days past dues ,
# Due Day, EMI_DUEAMT ,EMI_OS_AMOUNT,FOIR,NET_LTV,OUTSTANDING_PRINCIPAL .They are the ones which has higher probabilites to 
# get foreclosed. Hence NBFC should give high focus to these customers

Machine Learning

Logistic Regression

In [235]:
from sklearn.linear_model import LogisticRegressionCV,LogisticRegression
In [236]:
log_modelCV=LogisticRegressionCV()
In [237]:
log_modelCV=log_modelCV.fit(x_train_new,y_train_new)
In [238]:
log_modelCV.score(x_train_new,y_train_new)
Out[238]:
0.9649637176574067
In [239]:
log_modelCV.score(x_test_new,y_test_new)
Out[239]:
0.9628869505729434
In [240]:
log_model=LogisticRegression()
log_model=log_model.fit(x_train_new,y_train_new)
In [241]:
log_model.score(x_train_new,y_train_new)
Out[241]:
0.963131276112292
In [242]:
log_model.score(x_test_new,y_test_new)
Out[242]:
0.9610056439199589
In [243]:
from sklearn.metrics import classification_report,confusion_matrix
In [244]:
def confusionmatrix(y_actual,y_predict):
    sns.heatmap(confusion_matrix(y_actual,y_predict),annot=True,fmt='g')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()
In [245]:
def class_report(y_actual,y_predict):
    print(classification_report(y_actual,y_predict))
In [246]:
confusionmatrix(y_train_new,log_model.predict(x_train_new))
In [247]:
class_report(y_train_new,log_model.predict(x_train_new))
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     12386
           1       0.87      0.70      0.78      1257

    accuracy                           0.96     13643
   macro avg       0.92      0.85      0.88     13643
weighted avg       0.96      0.96      0.96     13643

In [248]:
confusionmatrix(y_test_new,log_model.predict(x_test_new))
In [249]:
class_report(y_test_new,log_model.predict(x_test_new))
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      5363
           1       0.80      0.70      0.75       484

    accuracy                           0.96      5847
   macro avg       0.89      0.84      0.86      5847
weighted avg       0.96      0.96      0.96      5847

In [250]:
confusionmatrix(y_train_new,log_modelCV.predict(x_train_new))

class_report(y_train_new,log_modelCV.predict(x_train_new))

confusionmatrix(y_test_new,log_modelCV.predict(x_test_new))

class_report(y_test_new,log_modelCV.predict(x_test_new))
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     12386
           1       0.87      0.73      0.79      1257

    accuracy                           0.96     13643
   macro avg       0.92      0.86      0.89     13643
weighted avg       0.96      0.96      0.96     13643

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      5363
           1       0.80      0.73      0.76       484

    accuracy                           0.96      5847
   macro avg       0.89      0.86      0.87      5847
weighted avg       0.96      0.96      0.96      5847

In [251]:
y_train_new.value_counts()
Out[251]:
0    12386
1     1257
Name: FORECLOSURE, dtype: int64
In [252]:
import imblearn
In [253]:
from imblearn.over_sampling import SMOTE
In [254]:
sm = SMOTE(random_state=0)
x_train_smote, y_train_smote = sm.fit_resample(x_train_new, y_train_new)
In [255]:
x_train.info()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-255-cfcea75e76a5> in <module>
----> 1 x_train.info()

NameError: name 'x_train' is not defined
In [256]:
y_train_smote.value_counts()
Out[256]:
1    12386
0    12386
Name: FORECLOSURE, dtype: int64
In [257]:
log_modelCV_smote=LogisticRegressionCV()

log_modelCV_smote=log_modelCV_smote.fit(x_train_smote,y_train_smote)

print(log_modelCV_smote.score(x_train_smote,y_train_smote))
0.9521637332472146
In [258]:
print(log_modelCV_smote.score(x_test_new,y_test_new))
0.9442449119206431
In [259]:
class_report(y_train_new,log_modelCV_smote.predict(x_train_new))
              precision    recall  f1-score   support

           0       0.99      0.95      0.97     12386
           1       0.66      0.94      0.77      1257

    accuracy                           0.95     13643
   macro avg       0.83      0.95      0.87     13643
weighted avg       0.96      0.95      0.95     13643

In [260]:
class_report(y_test_new,log_modelCV_smote.predict(x_test_new))
              precision    recall  f1-score   support

           0       0.99      0.95      0.97      5363
           1       0.61      0.93      0.73       484

    accuracy                           0.94      5847
   macro avg       0.80      0.94      0.85      5847
weighted avg       0.96      0.94      0.95      5847

In [261]:
confusionmatrix(y_train_new,log_modelCV_smote.predict(x_train_new))
In [262]:
confusionmatrix(y_test_new,log_modelCV_smote.predict(x_test_new))

Defining a function to plot roc_curve and calculate roc_auc_score

In [263]:
from sklearn.metrics import roc_auc_score,roc_curve
In [264]:
def roc_plot(model_name,x_data,y_lables):
    probs1=model_name.predict_proba(x_data)
    probs1=probs1[:,1]
    print('roc_auc_score -->',roc_auc_score(y_lables,probs1))
    fpr,tpr,thresholds=roc_curve(y_lables,probs1)
    plt.plot([0,1],[0,1],linestyle='--')
    plt.plot(fpr,tpr,marker='.')
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.title('ROC')
    plt.show()

#model_name is the name of the model, x_data is the data that needs to be passed and y_labels is the class values
In [265]:
#Graphical representation of the contrast between true positive rates and false-positive rates at various thresholds.
In [266]:
roc_plot(log_modelCV_smote,x_train_new,y_train_new)
roc_auc_score --> 0.9843644523335235
In [267]:
roc_plot(log_modelCV_smote,x_test_new,y_test_new)
roc_auc_score --> 0.9726080752261825
In [268]:
log_modelCV_smote.coef_[0]
Out[268]:
array([ -0.64228475,   0.63752446,  -0.60557819,  -0.60538527,
         0.77782614,  -0.99673254,   0.17830712,   1.87065632,
         0.9511264 ,   1.01685495,   0.3184325 ,  -0.20595776,
        -2.61748683,  -0.19860158,  -2.12093277,   0.16591918,
        -0.37773812,   0.3776597 ,  -0.09714228,  -0.4138838 ,
        -0.83446489,   1.83287231,  -0.35054752,  -1.04792052,
       -18.8139461 ,  -4.46458125, -19.63031119])
In [269]:
plt.figure(figsize=(10,8))
sns.barplot(x=log_modelCV_smote.coef_[0] ,y=x_test_new.columns)
Out[269]:
<matplotlib.axes._subplots.AxesSubplot at 0x7dab90edd8>
In [270]:
roc_plot(log_modelCV,x_train_new,y_train_new)
roc_plot(log_modelCV,x_test_new,y_test_new)
roc_auc_score --> 0.9830924539356609
roc_auc_score --> 0.9731747834488838

Support Vector Machine

In [271]:
from sklearn.svm import SVC
In [272]:
svc_smote=SVC(probability=True)
In [273]:
svc_smote=svc_smote.fit(x_train_smote,y_train_smote)
In [274]:
class_report(y_train_new,svc_smote.predict(x_train_new))
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     12386
           1       0.88      0.97      0.93      1257

    accuracy                           0.99     13643
   macro avg       0.94      0.98      0.96     13643
weighted avg       0.99      0.99      0.99     13643

In [275]:
class_report(y_test_new,svc_smote.predict(x_test_new))
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5363
           1       0.86      0.87      0.87       484

    accuracy                           0.98      5847
   macro avg       0.93      0.93      0.93      5847
weighted avg       0.98      0.98      0.98      5847

In [276]:
confusionmatrix(y_train_new,svc_smote.predict(x_train_new))
In [277]:
confusionmatrix(y_test_new,svc_smote.predict(x_test_new))
In [278]:
roc_plot(svc_smote,x_train_new,y_train_new)
roc_auc_score --> 0.9959240043259764
In [279]:
roc_plot(svc_smote,x_test_new,y_test_new)
roc_auc_score --> 0.9818397175011518
In [280]:
svc_smote_nl=SVC(kernel='poly',degree=3,probability=True)
In [281]:
svc_smote_nl=svc_smote_nl.fit(x_train_smote,y_train_smote)
In [282]:
class_report(y_train_new,svc_smote_nl.predict(x_train_new))
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     12386
           1       0.86      0.96      0.91      1257

    accuracy                           0.98     13643
   macro avg       0.93      0.97      0.95     13643
weighted avg       0.98      0.98      0.98     13643

In [283]:
class_report(y_test_new,svc_smote_nl.predict(x_test_new))
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      5363
           1       0.79      0.91      0.85       484

    accuracy                           0.97      5847
   macro avg       0.89      0.94      0.92      5847
weighted avg       0.98      0.97      0.97      5847

In [284]:
confusionmatrix(y_train_new,svc_smote_nl.predict(x_train_new))
In [285]:
confusionmatrix(y_test_new,svc_smote_nl.predict(x_test_new))
In [286]:
roc_plot(svc_smote_nl,x_train_new,y_train_new)
roc_auc_score --> 0.9953837389995968
In [287]:
roc_plot(svc_smote_nl,x_test_new,y_test_new)
roc_auc_score --> 0.9585366830887486
In [288]:
#WITHOUT USING SMOTE
In [289]:
svc_normal=SVC(probability=True)
svc_normal=svc_normal.fit(x_train_new,y_train_new)
In [290]:
class_report(y_test_new,svc_normal.predict(x_test_new))
confusionmatrix(y_test_new,svc_normal.predict(x_test_new))
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      5363
           1       0.93      0.81      0.86       484

    accuracy                           0.98      5847
   macro avg       0.96      0.90      0.93      5847
weighted avg       0.98      0.98      0.98      5847

In [291]:
class_report(y_train_new,svc_normal.predict(x_train_new))
confusionmatrix(y_train_new,svc_normal.predict(x_train_new))
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     12386
           1       0.94      0.89      0.92      1257

    accuracy                           0.98     13643
   macro avg       0.97      0.94      0.95     13643
weighted avg       0.98      0.98      0.98     13643

In [292]:
roc_plot(svc_normal,x_test_new,y_test_new)
roc_auc_score --> 0.9924130058573976
In [293]:
roc_plot(svc_normal,x_train_new,y_train_new)
roc_auc_score --> 0.9947549013751635

Naive Bayes

In [294]:
from sklearn.naive_bayes import GaussianNB
In [295]:
gnb=GaussianNB()
In [296]:
gnb=gnb.fit(x_train_smote,y_train_smote)
In [297]:
class_report(y_train_new,gnb.predict(x_train_new))
              precision    recall  f1-score   support

           0       1.00      0.52      0.68     12386
           1       0.17      0.99      0.29      1257

    accuracy                           0.56     13643
   macro avg       0.59      0.75      0.49     13643
weighted avg       0.92      0.56      0.65     13643

In [298]:
confusionmatrix(y_train_new,gnb.predict(x_train_new))
In [299]:
class_report(y_test_new,gnb.predict(x_test_new))
              precision    recall  f1-score   support

           0       0.99      0.53      0.69      5363
           1       0.15      0.95      0.26       484

    accuracy                           0.56      5847
   macro avg       0.57      0.74      0.48      5847
weighted avg       0.92      0.56      0.65      5847

In [300]:
confusionmatrix(y_test_new,gnb.predict(x_test_new))
In [301]:
roc_plot(gnb,x_train_new,y_train_new)
roc_auc_score --> 0.9368847870301894
In [302]:
roc_plot(gnb,x_test_new,y_test_new)
roc_auc_score --> 0.9107690742969505
In [303]:
gnb_normal=GaussianNB()
gnb_normal=gnb_normal.fit(x_train_new,y_train_new)
In [304]:
class_report(y_test_new,gnb_normal.predict(x_test_new))
confusionmatrix(y_test_new,gnb_normal.predict(x_test_new))
              precision    recall  f1-score   support

           0       1.00      0.51      0.68      5363
           1       0.16      0.99      0.27       484

    accuracy                           0.55      5847
   macro avg       0.58      0.75      0.47      5847
weighted avg       0.93      0.55      0.65      5847

In [305]:
class_report(y_train_new,gnb_normal.predict(x_train_new))
confusionmatrix(y_train_new,gnb_normal.predict(x_train_new))
              precision    recall  f1-score   support

           0       1.00      0.51      0.68     12386
           1       0.17      1.00      0.29      1257

    accuracy                           0.56     13643
   macro avg       0.59      0.76      0.49     13643
weighted avg       0.92      0.56      0.64     13643

In [306]:
roc_plot(gnb_normal,x_test_new,y_test_new)
roc_plot(gnb_normal,x_train_new,y_train_new)
roc_auc_score --> 0.9321132861680045
roc_auc_score --> 0.9479853238463988

Random Forest

In [307]:
from sklearn.ensemble import RandomForestClassifier
In [308]:
rfc=RandomForestClassifier(n_estimators=1000,
    criterion='gini',
    max_depth=None,
    min_samples_split=100,
    min_samples_leaf=50,
    min_weight_fraction_leaf=0.0,
    max_features='auto',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    random_state=0,
    verbose=0,
    warm_start=False,
    class_weight=None,
    ccp_alpha=0.0,
    max_samples=None,)
In [309]:
rfc=rfc.fit(x_train_smote,y_train_smote)
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-309-1431f9be46fe> in <module>
----> 1 rfc=rfc.fit(x_train_smote,y_train_smote)

~\Anaconda3\lib\site-packages\sklearn\ensemble\_forest.py in fit(self, X, y, sample_weight)
    390                     verbose=self.verbose, class_weight=self.class_weight,
    391                     n_samples_bootstrap=n_samples_bootstrap)
--> 392                 for i, t in enumerate(trees))
    393 
    394             # Collect newly grown trees

~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
    922                 self._iterating = self._original_iterator is not None
    923 
--> 924             while self.dispatch_one_batch(iterator):
    925                 pass
    926 

~\Anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
    757                 return False
    758             else:
--> 759                 self._dispatch(tasks)
    760                 return True
    761 

~\Anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
    714         with self._lock:
    715             job_idx = len(self._jobs)
--> 716             job = self._backend.apply_async(batch, callback=cb)
    717             # A job can complete so quickly than its callback is
    718             # called before we get here, causing self._jobs to

~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
    180     def apply_async(self, func, callback=None):
    181         """Schedule a func to be run"""
--> 182         result = ImmediateResult(func)
    183         if callback:
    184             callback(result)

~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
    547         # Don't delay the application, to avoid keeping the input
    548         # arguments in memory
--> 549         self.results = batch()
    550 
    551     def get(self):

~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226 
    227     def __len__(self):

~\Anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226 
    227     def __len__(self):

~\Anaconda3\lib\site-packages\sklearn\ensemble\_forest.py in _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, verbose, class_weight, n_samples_bootstrap)
    166                                                         indices=indices)
    167 
--> 168         tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
    169     else:
    170         tree.fit(X, y, sample_weight=sample_weight, check_input=False)

~\Anaconda3\lib\site-packages\sklearn\tree\_classes.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
    892             sample_weight=sample_weight,
    893             check_input=check_input,
--> 894             X_idx_sorted=X_idx_sorted)
    895         return self
    896 

~\Anaconda3\lib\site-packages\sklearn\tree\_classes.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
    373                                            min_impurity_split)
    374 
--> 375         builder.build(self.tree_, X, y, sample_weight, X_idx_sorted)
    376 
    377         if self.n_outputs_ == 1 and is_classifier(self):

KeyboardInterrupt: 
In [ ]:
class_report(y_train_new,rfc.predict(x_train_new))
In [ ]:
class_report(y_test_new,rfc.predict(x_test_new))
In [ ]:
from sklearn.model_selection import GridSearchCV
In [ ]:
param_grid = {
    'min_samples_split' : [50,70,100],
    'min_samples_leaf':[40,50,60],
    'max_depth':[10,15,20],
    'random_state' : [0]
}
In [ ]:
RF_model=RandomForestClassifier()
In [ ]:
grid_search=GridSearchCV(estimator=RF_model,param_grid=param_grid,cv=10)
In [ ]:
grid_search.fit(x_train_smote,y_train_smote)
In [1587]:
grid_search.best_estimator_
Out[1587]:
RandomForestClassifier(max_depth=15, min_samples_leaf=40, min_samples_split=50,
                       random_state=0)
In [310]:
RF_model=RandomForestClassifier(max_depth=15, min_samples_leaf=40, min_samples_split=50,
                       random_state=0)
In [311]:
RF_model.fit(x_train_smote, y_train_smote)
Out[311]:
RandomForestClassifier(max_depth=15, min_samples_leaf=40, min_samples_split=50,
                       random_state=0)
In [312]:
## Performance Matrix on train data set
y_train_predict = RF_model.predict(x_train_new)
model_score =RF_model.score(x_train_new, y_train_new)
print(model_score)
confusionmatrix(y_train_new, y_train_predict)
class_report(y_train_new, y_train_predict)
0.9774976178259913
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     12386
           1       0.83      0.95      0.89      1257

    accuracy                           0.98     13643
   macro avg       0.91      0.97      0.94     13643
weighted avg       0.98      0.98      0.98     13643

In [313]:
print('Scores and Curve for Training data is ')
roc_plot(RF_model,x_train_new,y_train_new)
Scores and Curve for Training data is 
roc_auc_score --> 0.995673895168166
In [314]:
## Performance Matrix on test data set
y_test_predict = RF_model.predict(x_test_new)
model_score = RF_model.score(x_test_new, y_test_new)
print(model_score)
confusionmatrix(y_test_new, y_test_predict)
class_report(y_test_new, y_test_predict)
0.9774243201641868
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      5363
           1       0.82      0.93      0.87       484

    accuracy                           0.98      5847
   macro avg       0.91      0.96      0.93      5847
weighted avg       0.98      0.98      0.98      5847

In [315]:
print('Scores and Curve for Testing data is ')
roc_plot(RF_model,x_test_new,y_test_new)
Scores and Curve for Testing data is 
roc_auc_score --> 0.9935997799430749
In [316]:
pd.DataFrame(100*(RF_model.feature_importances_),x_train_new.columns)
Out[316]:
0
BALANCE_TENURE 0.611439
COMPLETED_TENURE 3.779938
CURRENT_INTEREST_RATE_CHANGES 2.023371
DIFF_CURRENT_INTEREST_RATE_MAX_MIN 3.666159
DIFF_ORIGINAL_CURRENT_INTEREST_RATE 3.117104
DPD 0.167769
DUEDAY 1.115934
EMI_DUEAMT 4.584510
EMI_OS_AMOUNT 0.062604
EXCESS_ADJUSTED_AMT 1.027064
FOIR 0.321275
LAST_RECEIPT_AMOUNT 0.418426
LATEST_TRANSACTION_MONTH 40.654435
NET_LTV 0.180694
NET_RECEIVABLE 0.153690
NUM_EMI_CHANGES 0.913713
NUM_LOW_FREQ_TRANSACTIONS 0.569462
ORIGNAL_TENOR 1.203198
OUTSTANDING_PRINCIPAL 0.423151
PRE_EMI_DUEAMT 0.807983
Percentage_Completion 2.011967
Mean_Interest_Rate 4.519920
MEAN_EMI_AMOUNT 0.800630
PAID_AMOUNT 4.132782
LAP 16.212506
STHL 0.657010
STLAP 5.863266
In [317]:
plt.figure(figsize=(10,8))
sns.barplot(y=x_train_new.columns,x=100*(RF_model.feature_importances_))
Out[317]:
<matplotlib.axes._subplots.AxesSubplot at 0x7dbcb24c18>
In [318]:
RF_model_normal=RandomForestClassifier(max_depth=15, min_samples_leaf=40, min_samples_split=50,
                       random_state=0)
RF_model_normal=RF_model_normal.fit(x_train_new,y_train_new)
In [319]:
class_report(y_train_new,RF_model_normal.predict(x_train_new))

confusionmatrix(y_train_new,RF_model_normal.predict(x_train_new))
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     12386
           1       0.92      0.85      0.88      1257

    accuracy                           0.98     13643
   macro avg       0.95      0.92      0.93     13643
weighted avg       0.98      0.98      0.98     13643

In [320]:
class_report(y_test_new,RF_model_normal.predict(x_test_new))

confusionmatrix(y_test_new,RF_model_normal.predict(x_test_new))
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      5363
           1       0.90      0.81      0.85       484

    accuracy                           0.98      5847
   macro avg       0.94      0.90      0.92      5847
weighted avg       0.98      0.98      0.98      5847

In [321]:
roc_plot(RF_model_normal,x_test_new,y_test_new)

roc_plot(RF_model_normal,x_train_new,y_train_new)
roc_auc_score --> 0.9924802326316066
roc_auc_score --> 0.993993012615547

Ada-boosting - Using Cross Validation

In [322]:
from sklearn.ensemble import AdaBoostClassifier
In [1596]:
param_grid = {
    'n_estimators' : [100,500,1000],
    'learning_rate' : [0.1,0.01,0.001],
    'algorithm' : ['SAMME', 'SAMME.R']
}
In [1597]:
ADB_model=AdaBoostClassifier()
In [1598]:
grid_search=GridSearchCV(estimator=ADB_model,param_grid=param_grid)
In [1599]:
grid_search.fit(x_train_smote,y_train_smote)
Out[1599]:
GridSearchCV(estimator=AdaBoostClassifier(),
             param_grid={'algorithm': ['SAMME', 'SAMME.R'],
                         'learning_rate': [0.1, 0.01, 0.001],
                         'n_estimators': [100, 500, 1000]})
In [1644]:
grid_search.best_estimator_
Out[1644]:
AdaBoostClassifier(learning_rate=0.1, n_estimators=1000)
In [323]:
ADB_model=AdaBoostClassifier(learning_rate=0.1, n_estimators=1000)
In [324]:
ADB_model
Out[324]:
AdaBoostClassifier(learning_rate=0.1, n_estimators=1000)
In [325]:
ADB_model.fit(x_train_new,y_train_new)
Out[325]:
AdaBoostClassifier(learning_rate=0.1, n_estimators=1000)
In [326]:
## Performance Matrix on train data set
y_train_predict = ADB_model.predict(x_train_new)
model_score = ADB_model.score(x_train_new, y_train_new)
print(model_score)
confusionmatrix(y_train_new, y_train_predict)
class_report(y_train_new, y_train_predict)
0.98130909623983
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     12386
           1       0.92      0.87      0.90      1257

    accuracy                           0.98     13643
   macro avg       0.96      0.93      0.94     13643
weighted avg       0.98      0.98      0.98     13643

In [327]:
## Performance Matrix on test data set
y_train_predict = ADB_model.predict(x_test_new)
model_score = ADB_model.score(x_test_new, y_test_new)
print(model_score)
confusionmatrix(y_test_new, ADB_model.predict(x_test_new))
class_report(y_test_new, ADB_model.predict(x_test_new))
0.9813579613476997
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5363
           1       0.92      0.85      0.88       484

    accuracy                           0.98      5847
   macro avg       0.95      0.92      0.94      5847
weighted avg       0.98      0.98      0.98      5847

In [328]:
roc_plot(ADB_model,x_train_new,y_train_new)
roc_auc_score --> 0.9968747274266208
In [329]:
roc_plot(ADB_model,x_test_new,y_test_new)
roc_auc_score --> 0.9929240449175017
In [330]:
pd.DataFrame(100*(ADB_model.feature_importances_),x_train_new.columns)
Out[330]:
0
BALANCE_TENURE 4.2
COMPLETED_TENURE 1.7
CURRENT_INTEREST_RATE_CHANGES 1.8
DIFF_CURRENT_INTEREST_RATE_MAX_MIN 2.5
DIFF_ORIGINAL_CURRENT_INTEREST_RATE 6.2
DPD 0.4
DUEDAY 1.3
EMI_DUEAMT 2.1
EMI_OS_AMOUNT 0.3
EXCESS_ADJUSTED_AMT 1.5
FOIR 3.9
LAST_RECEIPT_AMOUNT 2.9
LATEST_TRANSACTION_MONTH 7.8
NET_LTV 4.1
NET_RECEIVABLE 4.6
NUM_EMI_CHANGES 0.0
NUM_LOW_FREQ_TRANSACTIONS 3.1
ORIGNAL_TENOR 0.7
OUTSTANDING_PRINCIPAL 5.2
PRE_EMI_DUEAMT 5.7
Percentage_Completion 3.3
Mean_Interest_Rate 13.7
MEAN_EMI_AMOUNT 1.2
PAID_AMOUNT 6.7
LAP 8.1
STHL 6.5
STLAP 0.5
In [331]:
ADB_model_normal=AdaBoostClassifier(learning_rate=0.1, n_estimators=1000)
ADB_model_normal=ADB_model_normal.fit(x_train_new,y_train_new)
In [332]:
class_report(y_train_new,RF_model_normal.predict(x_train_new))
confusionmatrix(y_train_new,RF_model_normal.predict(x_train_new))
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     12386
           1       0.92      0.85      0.88      1257

    accuracy                           0.98     13643
   macro avg       0.95      0.92      0.93     13643
weighted avg       0.98      0.98      0.98     13643

In [333]:
class_report(y_test_new,ADB_model_normal.predict(x_test_new))

confusionmatrix(y_test_new,ADB_model_normal.predict(x_test_new))
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5363
           1       0.92      0.85      0.88       484

    accuracy                           0.98      5847
   macro avg       0.95      0.92      0.94      5847
weighted avg       0.98      0.98      0.98      5847

In [336]:
roc_plot(ADB_model_normal,x_train_new,y_train_new)

roc_plot(ADB_model_normal,x_test_new,y_test_new)
roc_auc_score --> 0.9968747274266208
roc_auc_score --> 0.9929240449175017
In [337]:
y_train_new.value_counts(normalize=True)
Out[337]:
0    0.907865
1    0.092135
Name: FORECLOSURE, dtype: float64

KNN

In [338]:
from sklearn.neighbors import KNeighborsClassifier
In [339]:
KNN=KNeighborsClassifier()
KNN=KNN.fit(x_train_smote,y_train_smote)
In [340]:
## Performance Matrix on train data set
y_train_predict = KNN.predict(x_train_new)
model_score = KNN.score(x_train_new, y_train_new)
print(model_score)
confusionmatrix(y_train_new, y_train_predict)
class_report(y_train_new, y_train_predict)
0.9703877446309462
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     12386
           1       0.76      1.00      0.86      1257

    accuracy                           0.97     13643
   macro avg       0.88      0.98      0.92     13643
weighted avg       0.98      0.97      0.97     13643

In [345]:
## Performance Matrix on test data set
y_train_predict = KNN.predict(x_test_new)
model_score = KNN.score(x_test_new, y_test_new)
print(model_score)
confusionmatrix(y_test_new, KNN.predict(x_test_new))
class_report(y_test_new, KNN.predict(x_test_new))
0.9521121942876689
              precision    recall  f1-score   support

           0       0.99      0.96      0.97      5363
           1       0.66      0.88      0.75       484

    accuracy                           0.95      5847
   macro avg       0.82      0.92      0.86      5847
weighted avg       0.96      0.95      0.96      5847

In [346]:
roc_plot(KNN,x_train_new,y_train_new)
roc_auc_score --> 0.9998507951788409
In [347]:
roc_plot(KNN,x_test_new,y_test_new)
roc_auc_score --> 0.9543936645796188
In [348]:
KNN_normal=KNeighborsClassifier()
KNN_normal=KNN_normal.fit(x_train_new,y_train_new)
In [349]:
## Performance Matrix on train data set
y_train_predict = KNN_normal.predict(x_train_new)
model_score = KNN_normal.score(x_train_new, y_train_new)
print(model_score)
confusionmatrix(y_train_new, y_train_predict)
class_report(y_train_new, y_train_predict)
0.9742725207065894
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     12386
           1       0.93      0.78      0.85      1257

    accuracy                           0.97     13643
   macro avg       0.95      0.89      0.92     13643
weighted avg       0.97      0.97      0.97     13643

In [350]:
## Performance Matrix on test data set
y_train_predict = KNN_normal.predict(x_test_new)
model_score = KNN_normal.score(x_test_new, y_test_new)
print(model_score)
confusionmatrix(y_test, KNN_normal.predict(x_test_new))
class_report(y_test, KNN_normal.predict(x_test_new))
0.969557037797161
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-350-6144d5ff3cf2> in <module>
      3 model_score = KNN_normal.score(x_test_new, y_test_new)
      4 print(model_score)
----> 5 confusionmatrix(y_test, KNN_normal.predict(x_test_new))
      6 class_report(y_test, KNN_normal.predict(x_test_new))

NameError: name 'y_test' is not defined
In [ ]:
roc_plot(KNN_normal,x_train_new,y_train_new)
In [ ]:
roc_plot(KNN_normal,x_test_new,y_test_new)

XGBoost

In [ ]:
import xgboost as xgb
In [1620]:
param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}
In [1621]:
gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch1.fit(x_train_smote, y_train_smote)
Out[1621]:
GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.8, gamma=0, gpu_id=None,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.1, max_delta_step=None,
                                     max_depth=5, min_child_weight=1,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=140, n_jobs=None, nthread=4,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=1, seed=27, subsample=0.8,
                                     tree_method=None, validate_parameters=None,
                                     verbosity=None),
             iid=False, n_jobs=4,
             param_grid={'max_depth': range(3, 10, 2),
                         'min_child_weight': range(1, 6, 2)},
             scoring='roc_auc')
In [1622]:
gsearch1.best_params_,gsearch1.best_score_
Out[1622]:
({'max_depth': 9, 'min_child_weight': 1}, 0.9993562137101512)
In [529]:
for i in range(8,12,1):
    print(i)
8
9
10
11
In [1623]:
param_test2 = {
 'max_depth':range(8,12,1),
 'min_child_weight':range(1,4,1)
}
In [1624]:
gsearch2 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=9,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test2, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch2.fit(x_train_smote, y_train_smote)
Out[1624]:
GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.8, gamma=0, gpu_id=None,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.1, max_delta_step=None,
                                     max_depth=9, min_child_weight=1,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=140, n_jobs=None, nthread=4,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=1, seed=27, subsample=0.8,
                                     tree_method=None, validate_parameters=None,
                                     verbosity=None),
             iid=False, n_jobs=4,
             param_grid={'max_depth': range(8, 12),
                         'min_child_weight': range(1, 4)},
             scoring='roc_auc')
In [1625]:
gsearch2.best_params_,gsearch2.best_score_
Out[1625]:
({'max_depth': 10, 'min_child_weight': 1}, 0.999389831669234)
In [533]:
param_test3 = {
 'gamma':[i/10.0 for i in range(0,5)]
}
In [534]:
gsearch3 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=11,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test3, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch3.fit(x_train_smote, y_train_smote)
Out[534]:
GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.8, gamma=0, gpu_id=None,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.1, max_delta_step=None,
                                     max_depth=11, min_child_weight=1,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=140, n_jobs=None, nthread=4,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=1, seed=27, subsample=0.8,
                                     tree_method=None, validate_parameters=None,
                                     verbosity=None),
             iid=False, n_jobs=4,
             param_grid={'gamma': [0.0, 0.1, 0.2, 0.3, 0.4]},
             scoring='roc_auc')
In [535]:
gsearch3.best_params_,gsearch3.best_score_
Out[535]:
({'gamma': 0.0}, 0.99907841867544)
In [1310]:
xgb2 = xgb.XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=11,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)
In [1311]:
xgb2.fit(x_train_smote, y_train_smote)
Out[1311]:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=11,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=1000, n_jobs=4, nthread=4, num_parallel_tree=1,
              random_state=27, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=27, subsample=0.8, tree_method='exact',
              validate_parameters=1, verbosity=None)
In [1312]:
class_report(y_train_new,xgb2.predict(x_train_new))
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12386
           1       1.00      1.00      1.00      1257

    accuracy                           1.00     13643
   macro avg       1.00      1.00      1.00     13643
weighted avg       1.00      1.00      1.00     13643

In [1313]:
class_report(y_test_new,xgb2.predict(x_test_new))
              precision    recall  f1-score   support

           0       1.00      0.59      0.74      5363
           1       0.18      0.99      0.30       484

    accuracy                           0.62      5847
   macro avg       0.59      0.79      0.52      5847
weighted avg       0.93      0.62      0.71      5847

In [542]:
param_test4 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}
In [543]:
gsearch4 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate =0.1, n_estimators=200, max_depth=11,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test4, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch4.fit(x_train_smote, y_train_smote)
Out[543]:
GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.8, gamma=0, gpu_id=None,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.1, max_delta_step=None,
                                     max_depth=11, min_child_weight=1,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=200, n_jobs=None, nthread=4,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=1, seed=27, subsample=0.8,
                                     tree_method=None, validate_parameters=None,
                                     verbosity=None),
             iid=False, n_jobs=4,
             param_grid={'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
                         'subsample': [0.6, 0.7, 0.8, 0.9]},
             scoring='roc_auc')
In [544]:
gsearch4.best_params_,gsearch4.best_score_
Out[544]:
({'colsample_bytree': 0.6, 'subsample': 0.8}, 0.9991694112484719)
In [545]:
param_test5 = {
 'subsample':[i/100 for i in range(75,90,5)],
 'colsample_bytree':[i/10.0 for i in range(4,7)]
}
In [546]:
gsearch5 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate =0.1, n_estimators=200, max_depth=11,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.6,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test5, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch5.fit(x_train_smote, y_train_smote)
Out[546]:
GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.6, gamma=0, gpu_id=None,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.1, max_delta_step=None,
                                     max_depth=11, min_child_weight=1,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=200, n_jobs=None, nthread=4,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=1, seed=27, subsample=0.8,
                                     tree_method=None, validate_parameters=None,
                                     verbosity=None),
             iid=False, n_jobs=4,
             param_grid={'colsample_bytree': [0.4, 0.5, 0.6],
                         'subsample': [0.75, 0.8, 0.85]},
             scoring='roc_auc')
In [547]:
gsearch5.best_params_,gsearch5.best_score_
Out[547]:
({'colsample_bytree': 0.6, 'subsample': 0.85}, 0.9991718275457966)
In [552]:
param_test6 = {
 'subsample':[i/100 for i in range(80,90,2)],
 'colsample_bytree':[i/100 for i in range(58,65,2)]
}
In [553]:
gsearch6 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate =0.1, n_estimators=200, max_depth=11,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.6,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test6, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch6.fit(x_train_smote, y_train_smote)
Out[553]:
GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.6, gamma=0, gpu_id=None,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.1, max_delta_step=None,
                                     max_depth=11, min_child_weight=1,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=200, n_jobs=None, nthread=4,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=1, seed=27, subsample=0.8,
                                     tree_method=None, validate_parameters=None,
                                     verbosity=None),
             iid=False, n_jobs=4,
             param_grid={'colsample_bytree': [0.58, 0.6, 0.62, 0.64],
                         'subsample': [0.8, 0.82, 0.84, 0.86, 0.88]},
             scoring='roc_auc')
In [555]:
gsearch6.best_params_,gsearch6.best_score_
Out[555]:
({'colsample_bytree': 0.6, 'subsample': 0.8}, 0.9991694112484719)
In [556]:
param_test7 = {
 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
In [557]:
gsearch7= GridSearchCV(estimator = xgb.XGBClassifier( learning_rate =0.1, n_estimators=200, max_depth=11,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.6,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test7, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch7.fit(x_train_smote, y_train_smote)
Out[557]:
GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.6, gamma=0, gpu_id=None,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.1, max_delta_step=None,
                                     max_depth=11, min_child_weight=1,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=200, n_jobs=None, nthread=4,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=1, seed=27, subsample=0.8,
                                     tree_method=None, validate_parameters=None,
                                     verbosity=None),
             iid=False, n_jobs=4,
             param_grid={'reg_alpha': [1e-05, 0.01, 0.1, 1, 100]},
             scoring='roc_auc')
In [558]:
gsearch7.best_params_,gsearch7.best_score_
Out[558]:
({'reg_alpha': 1e-05}, 0.999176387012094)
In [562]:
param_test8 = {
 'reg_alpha':[0.001,0.002,0.003,0.004 ]
}
In [563]:
gsearch8= GridSearchCV(estimator = xgb.XGBClassifier( learning_rate =0.1, n_estimators=200, max_depth=11,reg_alpha=1e-05,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.6,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test8, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch8.fit(x_train_smote, y_train_smote)
Out[563]:
GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.6, gamma=0, gpu_id=None,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.1, max_delta_step=None,
                                     max_depth=11, min_child_weight=1,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=200, n_jobs=None, nthread=4,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=1e-05, reg_lambda=None,
                                     scale_pos_weight=1, seed=27, subsample=0.8,
                                     tree_method=None, validate_parameters=None,
                                     verbosity=None),
             iid=False, n_jobs=4,
             param_grid={'reg_alpha': [0.001, 0.002, 0.003, 0.004]},
             scoring='roc_auc')
In [564]:
gsearch8.best_params_,gsearch8.best_score_
Out[564]:
({'reg_alpha': 0.001}, 0.9991973459791664)
In [1610]:
xgb3=xgb.XGBClassifier( learning_rate =0.1, n_estimators=200, max_depth=11,reg_alpha=0.001,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.6,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
In [1611]:
xgb3=xgb3.fit(x_train_smote,y_train_smote)
In [1316]:
class_report(y_test_new,xgb3.predict(x_test_new))
              precision    recall  f1-score   support

           0       1.00      0.62      0.76      5363
           1       0.19      0.99      0.32       484

    accuracy                           0.65      5847
   macro avg       0.59      0.80      0.54      5847
weighted avg       0.93      0.65      0.73      5847

In [1317]:
class_report(y_train_new,xgb3.predict(x_train_new))
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12386
           1       1.00      1.00      1.00      1257

    accuracy                           1.00     13643
   macro avg       1.00      1.00      1.00     13643
weighted avg       1.00      1.00      1.00     13643

In [573]:
param_test9 = {
 'learning_rate':[0.01,0.02,0.1]
}
In [574]:
gsearch9= GridSearchCV(estimator = xgb.XGBClassifier( learning_rate =0.1, n_estimators=200, max_depth=11,reg_alpha=1e-05,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.6,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test9, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch9.fit(x_train_smote, y_train_smote)
Out[574]:
GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.6, gamma=0, gpu_id=None,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.1, max_delta_step=None,
                                     max_depth=11, min_child_weight=1,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=200, n_jobs=None, nthread=4,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=1e-05, reg_lambda=None,
                                     scale_pos_weight=1, seed=27, subsample=0.8,
                                     tree_method=None, validate_parameters=None,
                                     verbosity=None),
             iid=False, n_jobs=4,
             param_grid={'learning_rate': [0.01, 0.02, 0.1]},
             scoring='roc_auc')
In [575]:
gsearch9.best_params_,gsearch9.best_score_
Out[575]:
({'learning_rate': 0.1}, 0.999176387012094)
In [1318]:
xgb4=xgb.XGBClassifier( learning_rate =0.1, n_estimators=200, max_depth=11,reg_alpha=1e-05,booster='gbtree',
 min_child_weight=44, gamma=0, subsample=0.8, colsample_bytree=0.6,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
xgb4=xgb4.fit(x_train_smote, y_train_smote)
In [1319]:
class_report(y_train_new,xgb4.predict(x_train_new))
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     12386
           1       0.91      0.96      0.93      1257

    accuracy                           0.99     13643
   macro avg       0.95      0.97      0.96     13643
weighted avg       0.99      0.99      0.99     13643

In [1320]:
class_report(y_test_new,xgb4.predict(x_test_new))
              precision    recall  f1-score   support

           0       1.00      0.53      0.69      5363
           1       0.16      1.00      0.28       484

    accuracy                           0.57      5847
   macro avg       0.58      0.76      0.48      5847
weighted avg       0.93      0.57      0.66      5847

In [1323]:
confusionmatrix(y_train_new,xgb4.predict(x_train_new))
In [1321]:
confusionmatrix(y_test_new,xgb4.predict(x_test_new))
In [1324]:
print('Scores and Curve for Testing data is ')
roc_plot(xgb4,x_test_new,y_test_new)
Scores and Curve for Testing data is 
roc_auc_score --> 0.9626870214185659
In [1325]:
print('Scores and Curve for Training data is ')
roc_plot(xgb4,x_train_new,y_train_new)
Scores and Curve for Training data is 
roc_auc_score --> 0.9970834728716346
In [1326]:
feature_importance=xgb4.get_booster().get_score(importance_type="gain")
feature_col = list(feature_importance.keys())
feature_value =list(feature_importance.values())
In [ ]:
[feature_value]
In [1327]:
plt.figure(figsize=(10,5))
sns.barplot(x=100*(xgb4.feature_importances_),y=x_train_smote.columns)
Out[1327]:
<matplotlib.axes._subplots.AxesSubplot at 0xc26ceba4a8>
In [673]:
xgb4.get_booster().get_score(importance_type='weight')
Out[673]:
{'PAID_AMOUNT': 109,
 'Percentage_Completion': 144,
 'BALANCE_TENURE': 143,
 'COMPLETED_TENURE': 174,
 'OUTSTANDING_PRINCIPAL': 66,
 'DIFF_CURRENT_INTEREST_RATE_MAX_MIN': 107,
 'EXCESS_ADJUSTED_AMT': 104,
 'EMI_DUEAMT': 88,
 'CURRENT_INTEREST_RATE_CHANGES': 56,
 'DUEDAY': 55,
 'NUM_LOW_FREQ_TRANSACTIONS': 259,
 'MEAN_EMI_AMOUNT': 77,
 'LATEST_TRANSACTION_MONTH': 245,
 'DPD': 13,
 'NET_LTV': 141,
 'FOIR': 161,
 'NUM_EMI_CHANGES': 288,
 'LAST_RECEIPT_AMOUNT': 83,
 'NET_RECEIVABLE': 120,
 'ORIGNAL_TENOR': 108,
 'DIFF_ORIGINAL_CURRENT_INTEREST_RATE': 112,
 'PRE_EMI_DUEAMT': 119,
 'Mean_Interest_Rate': 344,
 'EMI_OS_AMOUNT': 9}
In [1657]:
xgb4_normal=xgb.XGBClassifier( learning_rate =0.1, n_estimators=200, max_depth=11,reg_alpha=1e-05,booster='gbtree',
 min_child_weight=44, gamma=0, subsample=0.8, colsample_bytree=0.6,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27)

xgb4_normal=xgb4_normal.fit(x_train_new, y_train_new)
In [1658]:
## Performance Matrix on train data set
y_train_predict = xgb4_normal.predict(x_train_new)
model_score = xgb4_normal.score(x_train_new, y_train_new)
print(model_score)
confusionmatrix(y_train_new, y_train_predict)
class_report(y_train_new, y_train_predict)
0.9842410027120135
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     12386
           1       0.94      0.89      0.91      1257

    accuracy                           0.98     13643
   macro avg       0.96      0.94      0.95     13643
weighted avg       0.98      0.98      0.98     13643

In [1659]:
## Performance Matrix on test data set
y_test_predict = xgb4_normal.predict(x_test_new)
model_score = xgb4_normal.score(x_test_new, y_test_new)
print(model_score)
confusionmatrix(y_test, xgb4_normal.predict(x_test_new))
class_report(y_test, xgb4_normal.predict(x_test_new))
0.9810159055926116
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5363
           1       0.91      0.85      0.88       484

    accuracy                           0.98      5847
   macro avg       0.95      0.92      0.94      5847
weighted avg       0.98      0.98      0.98      5847

In [1683]:
roc_plot(xgb4_normal,x_train_new,y_train_new)
roc_auc_score --> 0.9956976600342136
In [1684]:
roc_plot(xgb4_normal,x_test_new,y_test_new)
roc_auc_score --> 0.9921631688197212

Linear Discriminant Analysis

In [1612]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
In [1614]:
lda=LinearDiscriminantAnalysis()
In [1615]:
lda=lda.fit(x_train_smote, y_train_smote)
In [1617]:
## Performance Matrix on train data set
y_train_predict = lda.predict(x_train_new)
model_score = lda.score(x_train_new, y_train_new)
print(model_score)
confusionmatrix(y_train_new, y_train_predict)
class_report(y_train_new, y_train_predict)
0.9011214542256102
              precision    recall  f1-score   support

           0       0.99      0.90      0.94     12386
           1       0.48      0.95      0.64      1257

    accuracy                           0.90     13643
   macro avg       0.74      0.92      0.79     13643
weighted avg       0.95      0.90      0.91     13643

In [1616]:
## Performance Matrix on test data set
y_train_predict = lda.predict(x_test_new)
model_score = lda.score(x_test_new, y_test_new)
print(model_score)
confusionmatrix(y_test, lda.predict(x_test_new))
class_report(y_test, lda.predict(x_test_new))
0.8917393535146229
              precision    recall  f1-score   support

           0       0.99      0.89      0.94      5363
           1       0.43      0.92      0.59       484

    accuracy                           0.89      5847
   macro avg       0.71      0.91      0.76      5847
weighted avg       0.95      0.89      0.91      5847

In [1618]:
roc_plot(lda,x_test_new, y_test_new)
roc_auc_score --> 0.9607503509661393
In [1619]:
roc_plot(lda,x_train_new, y_train_new)
roc_auc_score --> 0.9764205641368132
In [1660]:
lda_normal=LinearDiscriminantAnalysis()

lda_normal=lda_normal.fit(x_train_new, y_train_new)
In [1661]:
## Performance Matrix on train data set
y_train_predict = lda_normal.predict(x_train_new)
model_score = lda_normal.score(x_train_new, y_train_new)
print(model_score)
confusionmatrix(y_train_new, y_train_predict)
class_report(y_train_new, y_train_predict)
0.9396027266730191
              precision    recall  f1-score   support

           0       0.96      0.97      0.97     12386
           1       0.68      0.64      0.66      1257

    accuracy                           0.94     13643
   macro avg       0.82      0.81      0.81     13643
weighted avg       0.94      0.94      0.94     13643

In [1662]:
## Performance Matrix on test data set
y_test_predict = lda_normal.predict(x_test_new)
model_score = lda_normal.score(x_test_new, y_test_new)
print(model_score)
confusionmatrix(y_test, lda_normal.predict(x_test_new))
class_report(y_test, lda_normal.predict(x_test_new))
0.935180434410809
              precision    recall  f1-score   support

           0       0.97      0.96      0.96      5363
           1       0.60      0.63      0.62       484

    accuracy                           0.94      5847
   macro avg       0.78      0.80      0.79      5847
weighted avg       0.94      0.94      0.94      5847

In [1685]:
roc_plot(lda_normal,x_train_new, y_train_new)

roc_plot(lda_normal,x_test_new, y_test_new)
roc_auc_score --> 0.9598425147287574
roc_auc_score --> 0.9445831785897558
In [ ]:

In [ ]:

BELOW CODES NOT TO BE CONSIDERED FOR EVALUATION

2nd Iteration

In [342]:
df_2.head()
Out[342]:
const BALANCE_TENURE COMPLETED_TENURE CURRENT_INTEREST_RATE_CHANGES DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DPD DUEDAY EMI_DUEAMT EMI_OS_AMOUNT EXCESS_ADJUSTED_AMT FOIR LAST_RECEIPT_AMOUNT LATEST_TRANSACTION_MONTH NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PRE_EMI_DUEAMT FORECLOSURE Percentage_Completion Mean_Interest_Rate MEAN_EMI_AMOUNT LAP STHL STLAP PAID_AMOUNT
0 1.0 0 45 1 0.312138 -2.809242 0 1 8.614898e+06 175477.782641 2.135869e+05 0.60 1000000.0 5.0 40.06 175477.782641 1 1 120 1.164472e+05 7994.273589 1 100.000000 13.525980 389039.333333 0 0 0 1.431938e+07
1 1.0 99 38 0 0.000000 -3.121380 0 1 1.061903e+07 279448.084560 0.000000e+00 0.60 126530.0 11.0 84.31 279448.084560 0 0 180 1.234760e+07 100504.575864 1 27.737226 13.734072 126530.000000 0 0 0 1.188781e+07
2 1.0 231 81 2 0.624276 -4.057794 0 5 4.670211e+06 0.000000 1.179716e+06 0.72 22878.0 8.0 50.89 0.000000 2 3 180 2.994184e+06 5947.630536 1 25.961538 15.814992 181918.666667 0 0 0 3.929335e+06
3 1.0 0 91 4 2.497104 -0.624276 0 1 1.313098e+07 63659.280762 6.770848e+06 0.60 65741.0 5.0 84.63 63659.280762 3 8 180 4.295120e+04 60322.180776 1 100.000000 12.069336 543827.333333 0 0 0 1.462863e+07
4 1.0 215 89 2 0.624276 -4.057794 0 5 1.048923e+07 0.000000 1.202181e+05 0.83 54433.0 4.0 30.94 0.000000 0 3 180 7.200653e+06 27732.787464 1 29.276316 15.190716 54433.000000 0 0 0 7.149063e+06
In [343]:
df4=df_2.copy()
In [1905]:
from sklearn.model_selection import train_test_split

x=df4.drop(['FORECLOSURE'],axis=1)

y=df4.pop('FORECLOSURE')

x_new=x.copy()
y_new=y.copy()

x_train_new,x_test_new,y_train_new,y_test_new=train_test_split(x_new,y_new,test_size=0.3,random_state=0)

#/**x_new_copy=pd.DataFrame()
#x_new_copy['LAP']=x_train_new['LAP']
#x_new_copy['STHL']=x_train_new['STHL']
#x_new_copy['STLAP']=x_train_new['STLAP']

#x_new_copy.head(2)

#x_new_copy.reset_index(drop=True,inplace=True)''''''**/
In [1906]:
### Z score Transformation

from sklearn.preprocessing import StandardScaler

x_train_new.drop(['const'],axis=1,inplace=True)

x_test_new.drop(['const'],axis=1,inplace=True)

x_train_new.shape
Out[1906]:
(13643, 27)
In [1907]:
list_col=['COMPLETED_TENURE', 'DIFF_CURRENT_INTEREST_RATE_MAX_MIN', \
       'DIFF_ORIGINAL_CURRENT_INTEREST_RATE', 'EMI_DUEAMT', \
       'LATEST_TRANSACTION_MONTH', 'Mean_Interest_Rate', 'PAID_AMOUNT', 'LAP', \
       'STLAP']
for col in x_train_new.columns:
    if col not in list_col:
        x_train_new.drop([col],axis=1,inplace=True)
        x_test_new.drop([col],axis=1,inplace=True)
In [1908]:
print(x_train_new.shape)
print(x_test_new.shape)
(13643, 9)
(5847, 9)
In [1909]:
x_train_new.head(2)
Out[1909]:
COMPLETED_TENURE DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE EMI_DUEAMT LATEST_TRANSACTION_MONTH Mean_Interest_Rate LAP STLAP PAID_AMOUNT
16187 6 0.000000 0.000000 172439.323056 12.0 19.040418 0 1 120139.695099
11072 9 0.499421 -0.499421 180741.270024 9.0 14.067019 1 0 130201.379551
In [1910]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

x_train_new=pd.DataFrame(ss.fit_transform(x_train_new),columns = x_train_new.columns)
In [1911]:
x_train_new.head()
Out[1911]:
COMPLETED_TENURE DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE EMI_DUEAMT LATEST_TRANSACTION_MONTH Mean_Interest_Rate LAP STLAP PAID_AMOUNT
0 -0.705212 -0.622898 0.450196 -0.262438 0.463596 1.737300 -0.671965 2.339890 -0.260036
1 -0.523551 -0.111365 -0.117511 -0.261256 -0.604188 -0.237325 1.488172 -0.427371 -0.258269
2 0.324202 -0.622898 0.450196 -0.231709 0.463596 0.126204 -0.671965 -0.427371 -0.151162
3 -0.281336 -0.622898 0.450196 -0.175345 -0.604188 1.365509 -0.671965 2.339890 -0.186443
4 0.203095 -0.622898 0.450196 -0.161634 0.463596 0.745857 -0.671965 2.339890 -0.172700
In [1717]:
x_test_new.head()
Out[1717]:
BALANCE_TENURE COMPLETED_TENURE CURRENT_INTEREST_RATE_CHANGES DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DPD DUEDAY EMI_DUEAMT EMI_OS_AMOUNT EXCESS_ADJUSTED_AMT FOIR LAST_RECEIPT_AMOUNT LATEST_TRANSACTION_MONTH NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PRE_EMI_DUEAMT Percentage_Completion Mean_Interest_Rate MEAN_EMI_AMOUNT PAID_AMOUNT LAP STHL STLAP
0 -0.044464 0.626971 -0.676893 -0.622898 -0.614255 -0.118447 3.366490 -0.002305 -0.065941 -0.107284 0.566118 -0.053824 -2.027899 -0.450178 0.035201 -1.145959 -0.705400 -0.070397 -0.128890 -0.118697 0.198709 0.869787 -0.068570 -0.039478 1 0 0
1 0.533790 -0.886874 0.200694 0.655934 -0.969073 -0.118447 -0.287603 -0.259112 -0.065941 -0.107284 -0.280311 -0.056991 0.463596 0.481554 0.035201 -0.382359 -1.091680 -0.070397 -0.090971 -0.183111 -0.714189 -0.534758 -0.101512 -0.256752 1 0 0
2 1.033902 1.959155 -0.676893 -0.622898 -3.452793 -0.118447 3.366490 0.730623 -0.065941 -0.107284 0.279193 -0.090194 -1.671972 0.447483 0.035201 0.763040 0.067159 -0.159898 0.228645 -0.107490 0.425598 1.365509 -0.034073 0.575544 1 0 0
3 0.033679 -0.765766 -0.676893 -0.622898 0.450196 -0.118447 -0.287603 -0.250668 -0.065941 -0.107284 -0.380735 -0.064337 0.107668 -0.620057 0.035001 -0.382359 0.067159 -0.070397 -0.241283 -0.173687 -0.617304 1.960375 -0.111923 -0.249393 0 0 1
4 0.987017 -0.826320 -0.676893 -0.622898 0.450196 -0.118447 -0.287603 -0.274526 -0.065941 -0.107284 -0.165541 -0.079361 0.463596 0.048575 0.033393 -0.000560 -0.319120 1.272128 -0.331136 -0.203912 -0.697091 0.126204 -0.138889 -0.270291 0 1 0
In [1718]:
x_train_smote.head()
Out[1718]:
BALANCE_TENURE COMPLETED_TENURE CURRENT_INTEREST_RATE_CHANGES DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DPD DUEDAY EMI_DUEAMT EMI_OS_AMOUNT EXCESS_ADJUSTED_AMT FOIR LAST_RECEIPT_AMOUNT LATEST_TRANSACTION_MONTH NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PRE_EMI_DUEAMT Percentage_Completion Mean_Interest_Rate MEAN_EMI_AMOUNT PAID_AMOUNT LAP STHL STLAP
0 0.018050 -0.705212 -0.676893 -0.622898 0.450196 -0.118447 -0.287603 -0.262438 -0.065941 -0.107284 -0.122503 -0.075827 0.463596 -1.732552 0.013175 -0.764159 -0.319120 -0.070397 -0.329322 -0.199913 -0.577410 1.737300 -0.133733 -0.260036 0 0 1
1 0.127450 -0.523551 0.200694 -0.111365 -0.117511 -0.118447 -0.287603 -0.261256 -0.065941 -0.107284 -0.438120 -0.080292 -0.604188 -1.386642 0.035201 -0.382359 -0.705400 -0.070397 -0.345788 -0.173526 -0.476625 -0.237325 -0.142860 -0.258269 1 0 0
2 0.705704 0.324202 -0.676893 -0.622898 0.450196 -0.118447 -0.287603 -0.231709 -0.065941 0.136986 -0.309004 -0.083243 0.463596 -0.038967 0.035201 0.381240 0.067159 1.272128 -0.373014 -0.198362 -0.131458 0.126204 -0.143385 -0.151162 0 1 0
3 -0.091349 -0.281336 -0.676893 -0.622898 0.450196 -0.118447 -0.287603 -0.175345 -0.065941 -0.107284 0.207461 -0.059544 -0.604188 0.053780 0.035201 -0.382359 -0.319120 -0.070397 -0.192704 -0.181246 -0.298152 1.365509 -0.108466 -0.186443 0 0 1
4 -0.216377 0.203095 -0.676893 -0.622898 0.450196 -0.118447 -0.287603 -0.161634 -0.065941 -0.107284 -0.265965 -0.069028 0.463596 1.138356 0.035201 -0.382359 -0.705400 -0.070397 -0.262827 -0.104943 0.020999 0.745857 -0.124190 -0.172700 0 0 1
In [1728]:
RF_model.feature_importances_.T
Out[1728]:
array([0.00611439, 0.03779938, 0.02023371, 0.03666159, 0.03117104,
       0.00167769, 0.01115934, 0.0458451 , 0.00062604, 0.01027064,
       0.00321275, 0.00418426, 0.40654435, 0.00180694, 0.0015369 ,
       0.00913713, 0.00569462, 0.01203198, 0.00423151, 0.00807983,
       0.02011967, 0.0451992 , 0.0080063 , 0.04132782, 0.16212506,
       0.0065701 , 0.05863266])
In [1781]:
feature_important=pd.DataFrame(columns=x_train_smote.columns)
In [1784]:
i=0
for col in feature_important.columns:
    feature_important.loc[0,col]=100* RF_model.feature_importances_[i]
    i = i+1
In [1777]:
for col in feature_important.columns:
    print(col)
BALANCE_TENURE
COMPLETED_TENURE
CURRENT_INTEREST_RATE_CHANGES
DIFF_CURRENT_INTEREST_RATE_MAX_MIN
DIFF_ORIGINAL_CURRENT_INTEREST_RATE
DPD
DUEDAY
EMI_DUEAMT
EMI_OS_AMOUNT
EXCESS_ADJUSTED_AMT
FOIR
LAST_RECEIPT_AMOUNT
LATEST_TRANSACTION_MONTH
NET_LTV
NET_RECEIVABLE
NUM_EMI_CHANGES
NUM_LOW_FREQ_TRANSACTIONS
ORIGNAL_TENOR
OUTSTANDING_PRINCIPAL
PRE_EMI_DUEAMT
Percentage_Completion
Mean_Interest_Rate
MEAN_EMI_AMOUNT
PAID_AMOUNT
LAP
STHL
STLAP
In [1765]:
#feature_important.loc[0,'BALANCE_TENURE']=RF_model.feature_importances_[0]
In [1799]:
feature_important.columns
Out[1799]:
Index(['COMPLETED_TENURE', 'DIFF_CURRENT_INTEREST_RATE_MAX_MIN',
       'DIFF_ORIGINAL_CURRENT_INTEREST_RATE', 'EMI_DUEAMT',
       'LATEST_TRANSACTION_MONTH', 'Mean_Interest_Rate', 'PAID_AMOUNT', 'LAP',
       'STLAP'],
      dtype='object')
In [1803]:
feature_important.shape
Out[1803]:
(1, 9)
In [1791]:
[feature_important.drop([col],axis=1,inplace=True) for col in feature_important.columns if feature_important.loc[0,col] < 3]
Out[1791]:
[None, None]
In [1809]:
x_train_new_copy=x_train_new.copy()
x_test_new_copy=x_test_new.copy()
x_train_smote_copy=x_train_smote.copy()
In [1810]:
list_col=['COMPLETED_TENURE', 'DIFF_CURRENT_INTEREST_RATE_MAX_MIN', \
       'DIFF_ORIGINAL_CURRENT_INTEREST_RATE', 'EMI_DUEAMT', \
       'LATEST_TRANSACTION_MONTH', 'Mean_Interest_Rate', 'PAID_AMOUNT', 'LAP', \
       'STLAP']
for col in x_train_new_copy.columns:
    if col not in list_col:
        x_train_new_copy.drop([col],axis=1,inplace=True)
        x_test_new_copy.drop([col],axis=1,inplace=True)
        x_train_smote_copy.drop([col],axis=1,inplace=True)
In [1920]:
df_2[['COMPLETED_TENURE', 'DIFF_CURRENT_INTEREST_RATE_MAX_MIN','DIFF_ORIGINAL_CURRENT_INTEREST_RATE', 'EMI_DUEAMT', \
       'LATEST_TRANSACTION_MONTH', 'Mean_Interest_Rate', 'PAID_AMOUNT', 'LAP','STLAP','FORECLOSURE']].head()
Out[1920]:
COMPLETED_TENURE DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE EMI_DUEAMT LATEST_TRANSACTION_MONTH Mean_Interest_Rate PAID_AMOUNT LAP STLAP FORECLOSURE
0 45 0.312138 -2.809242 8.614898e+06 5.0 13.525980 1.431938e+07 0 0 1
1 38 0.000000 -3.121380 1.061903e+07 11.0 13.734072 1.188781e+07 0 0 1
2 81 0.624276 -4.057794 4.670211e+06 8.0 15.814992 3.929335e+06 0 0 1
3 91 2.497104 -0.624276 1.313098e+07 5.0 12.069336 1.462863e+07 0 0 1
4 89 0.624276 -4.057794 1.048923e+07 4.0 15.190716 7.149063e+06 0 0 1
In [1582]:
from sklearn.model_selection import GridSearchCV
In [1818]:
param_grid = {
    'min_samples_split' : [40,50,55],
    'min_samples_leaf':[30,40,45],
    'max_depth':[10,15,20],
    'random_state' : [0]
}
In [1819]:
rf_deploy=RandomForestClassifier()
In [1820]:
grid_search_deploy=GridSearchCV(estimator=rf_deploy,param_grid=param_grid,cv=10)
In [1821]:
grid_search_deploy.fit(x_train_smote_copy,y_train_smote)
Out[1821]:
GridSearchCV(cv=10, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [10, 15, 20],
                         'min_samples_leaf': [30, 40, 45],
                         'min_samples_split': [40, 50, 55],
                         'random_state': [0]})
In [1587]:
grid_search_deploy.best_estimator_
Out[1587]:
RandomForestClassifier(max_depth=15, min_samples_leaf=40, min_samples_split=50,
                       random_state=0)
In [ ]:
ss.inverse_transform
In [1822]:
rf_deploy=RandomForestClassifier(max_depth=15, min_samples_leaf=40, min_samples_split=50,
                       random_state=0)
In [1823]:
rf_deploy=rf_deploy.fit(x_train_smote_copy,y_train_smote)
In [1824]:
class_report(y_train_new,rf_deploy.predict(x_train_new_copy))
              precision    recall  f1-score   support

           0       0.99      0.98      0.99     12386
           1       0.81      0.94      0.87      1257

    accuracy                           0.97     13643
   macro avg       0.90      0.96      0.93     13643
weighted avg       0.98      0.97      0.98     13643

In [1825]:
class_report(y_test_new,rf_deploy.predict(x_test_new_copy))
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      5363
           1       0.82      0.92      0.86       484

    accuracy                           0.98      5847
   macro avg       0.90      0.95      0.93      5847
weighted avg       0.98      0.98      0.98      5847

In [1827]:
confusionmatrix(y_train_new,rf_deploy.predict(x_train_new_copy))
In [1826]:
confusionmatrix(y_test_new,rf_deploy.predict(x_test_new_copy))
In [1842]:
x_test_new_copy.head(1)
Out[1842]:
COMPLETED_TENURE DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE EMI_DUEAMT LATEST_TRANSACTION_MONTH Mean_Interest_Rate PAID_AMOUNT LAP STLAP
0 0.626971 -0.622898 -0.614255 -0.002305 -2.027899 0.869787 -0.039478 1 0
In [1852]:
df_2.head(2)
Out[1852]:
const BALANCE_TENURE COMPLETED_TENURE CURRENT_INTEREST_RATE_CHANGES DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DPD DUEDAY EMI_DUEAMT EMI_OS_AMOUNT EXCESS_ADJUSTED_AMT FOIR LAST_RECEIPT_AMOUNT LATEST_TRANSACTION_MONTH NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PRE_EMI_DUEAMT FORECLOSURE Percentage_Completion Mean_Interest_Rate MEAN_EMI_AMOUNT LAP STHL STLAP PAID_AMOUNT
0 1.0 0 45 1 0.312138 -2.809242 0 1 8.614898e+06 175477.782641 213586.855368 0.6 1000000.0 5.0 40.06 175477.782641 1 1 120 1.164472e+05 7994.273589 1 100.000000 13.525980 389039.333333 0 0 0 1.431938e+07
1 1.0 99 38 0 0.000000 -3.121380 0 1 1.061903e+07 279448.084560 0.000000 0.6 126530.0 11.0 84.31 279448.084560 0 0 180 1.234760e+07 100504.575864 1 27.737226 13.734072 126530.000000 0 0 0 1.188781e+07
In [1853]:
x_train_new_copy.head(3)
Out[1853]:
COMPLETED_TENURE DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE EMI_DUEAMT LATEST_TRANSACTION_MONTH Mean_Interest_Rate PAID_AMOUNT LAP STLAP
0 -0.705212 -0.622898 0.450196 -0.262438 0.463596 1.737300 -0.260036 0 1
1 -0.523551 -0.111365 -0.117511 -0.261256 -0.604188 -0.237325 -0.258269 1 0
2 0.324202 -0.622898 0.450196 -0.231709 0.463596 0.126204 -0.151162 0 0
In [1865]:
ss.mean_.shape
Out[1865]:
(24,)
In [1866]:
a=x_train_new.copy()
In [1868]:
a.drop(['LAP','STHL','STLAP'],axis=1,inplace=True)
In [1880]:
scaling_features=pd.DataFrame(data=ss.mean_.reshape(1,24) , columns=a.columns)
In [1882]:
scaling_features.loc[1]=ss.var_
In [1883]:
scaling_features
Out[1883]:
BALANCE_TENURE COMPLETED_TENURE CURRENT_INTEREST_RATE_CHANGES DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DPD DUEDAY EMI_DUEAMT EMI_OS_AMOUNT EXCESS_ADJUSTED_AMT FOIR LAST_RECEIPT_AMOUNT LATEST_TRANSACTION_MONTH NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PRE_EMI_DUEAMT Percentage_Completion Mean_Interest_Rate MEAN_EMI_AMOUNT PAID_AMOUNT
0 172.845049 17.646046 0.771311 0.608149 -0.396044 6.597449 5.787070 2.015643e+06 2.687794e+04 3.054849e+05 0.605390 7.959314e+04 10.697501 50.933477 -4.593083e+04 3.001466 2.826138 183.146156 5.151292e+06 5.354039e+04 11.374233 14.664759 5.654482e+04 1.600412e+06
1 4094.169936 272.720205 1.298431 0.953205 0.773899 3102.421549 7.489294 4.932781e+13 1.661408e+11 8.107846e+12 0.485873 7.709679e+11 7.893615 446.590385 1.702585e+12 6.860073 6.701869 1997.367043 1.320841e+14 6.067323e+10 193.928357 6.343619 1.246777e+11 3.240525e+13
In [1884]:
scaling_features.iloc[1]=np.sqrt(scaling_features.iloc[1])
In [1885]:
scaling_features
Out[1885]:
BALANCE_TENURE COMPLETED_TENURE CURRENT_INTEREST_RATE_CHANGES DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DPD DUEDAY EMI_DUEAMT EMI_OS_AMOUNT EXCESS_ADJUSTED_AMT FOIR LAST_RECEIPT_AMOUNT LATEST_TRANSACTION_MONTH NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PRE_EMI_DUEAMT Percentage_Completion Mean_Interest_Rate MEAN_EMI_AMOUNT PAID_AMOUNT
0 172.845049 17.646046 0.771311 0.608149 -0.396044 6.597449 5.787070 2.015643e+06 26877.935002 3.054849e+05 0.605390 79593.136642 10.697501 50.933477 -4.593083e+04 3.001466 2.826138 183.146156 5.151292e+06 53540.389217 11.374233 14.664759 56544.823888 1.600412e+06
1 63.985701 16.514242 1.139487 0.976322 0.879715 55.699386 2.736657 7.023376e+06 407603.676889 2.847428e+06 0.697046 878047.771308 2.809558 21.132685 1.304831e+06 2.619174 2.588797 44.691912 1.149279e+07 246319.368207 13.925816 2.518654 353097.361974 5.692561e+06
In [1886]:
list_col=['COMPLETED_TENURE', 'DIFF_CURRENT_INTEREST_RATE_MAX_MIN', \
       'DIFF_ORIGINAL_CURRENT_INTEREST_RATE', 'EMI_DUEAMT', \
       'LATEST_TRANSACTION_MONTH', 'Mean_Interest_Rate', 'PAID_AMOUNT', 'LAP', \
       'STLAP']
[scaling_features.drop([col],axis=1,inplace=True) for col in scaling_features.columns if col not in list_col ]
Out[1886]:
[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]
In [1888]:
scaling_features.head()
Out[1888]:
COMPLETED_TENURE DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE EMI_DUEAMT LATEST_TRANSACTION_MONTH Mean_Interest_Rate PAID_AMOUNT
0 17.646046 0.608149 -0.396044 2.015643e+06 10.697501 14.664759 1.600412e+06
1 16.514242 0.976322 0.879715 7.023376e+06 2.809558 2.518654 5.692561e+06

Deployment

In [1828]:
import pickle
In [1830]:
ss.mean_
Out[1830]:
array([ 1.72845049e+02,  1.76460456e+01,  7.71311295e-01,  6.08148836e-01,
       -3.96044278e-01,  6.59744924e+00,  5.78707029e+00,  2.01564305e+06,
        2.68779350e+04,  3.05484863e+05,  6.05389922e-01,  7.95931366e+04,
        1.06975005e+01,  5.09334765e+01, -4.59308322e+04,  3.00146595e+00,
        2.82613795e+00,  1.83146156e+02,  5.15129170e+06,  5.35403892e+04,
        1.13742332e+01,  1.46647594e+01,  5.65448239e+04,  1.60041234e+06])
In [1835]:
rf_file=open('rf.pkl','wb')
pickle.dump(rf_deploy,rf_file)
loaded_model=pickle.load(open('rf.pkl','rb'))
print(loaded_model.score(x_train_new_copy,y_train_new))
0.974712306677417
In [1912]:
pickle.dump(ss,open('scaler.pkl','wb'))
In [1840]:
rf_deploy.feature_importances_.reshape(9,1)
Out[1840]:
array([[0.03025541],
       [0.02054905],
       [0.02322911],
       [0.04386066],
       [0.57496983],
       [0.03216985],
       [0.04933691],
       [0.17748843],
       [0.04814076]])
In [1839]:
rf_deploy.feature_importances_
Out[1839]:
array([0.03025541, 0.02054905, 0.02322911, 0.04386066, 0.57496983,
       0.03216985, 0.04933691, 0.17748843, 0.04814076])
In [1846]:
type([[0.626971,0.622898,0.6142,0.002305,2.02789,0.869787,0.03947,1,0]])
Out[1846]:
list
In [1811]:
x_train_new_copy.shape
Out[1811]:
(13643, 9)
In [1813]:
x_train_smote_copy.head()
Out[1813]:
COMPLETED_TENURE DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE EMI_DUEAMT LATEST_TRANSACTION_MONTH Mean_Interest_Rate PAID_AMOUNT LAP STLAP
0 -0.705212 -0.622898 0.450196 -0.262438 0.463596 1.737300 -0.260036 0 1
1 -0.523551 -0.111365 -0.117511 -0.261256 -0.604188 -0.237325 -0.258269 1 0
2 0.324202 -0.622898 0.450196 -0.231709 0.463596 0.126204 -0.151162 0 0
3 -0.281336 -0.622898 0.450196 -0.175345 -0.604188 1.365509 -0.186443 0 1
4 0.203095 -0.622898 0.450196 -0.161634 0.463596 0.745857 -0.172700 0 1
In [1748]:
RF_model.feature_importances_[0]
Out[1748]:
0.006114394510190951
In [1729]:
pd.DataFrame(100*(RF_model.feature_importances_.T))#.T,columns=x_train_smote.columns
Out[1729]:
0
0 0.611439
1 3.779938
2 2.023371
3 3.666159
4 3.117104
5 0.167769
6 1.115934
7 4.584510
8 0.062604
9 1.027064
10 0.321275
11 0.418426
12 40.654435
13 0.180694
14 0.153690
15 0.913713
16 0.569462
17 1.203198
18 0.423151
19 0.807983
20 2.011967
21 4.519920
22 0.800630
23 4.132782
24 16.212506
25 0.657010
26 5.863266
In [1724]:
x_train_smote.columns
Out[1724]:
Index(['BALANCE_TENURE', 'COMPLETED_TENURE', 'CURRENT_INTEREST_RATE_CHANGES',
       'DIFF_CURRENT_INTEREST_RATE_MAX_MIN',
       'DIFF_ORIGINAL_CURRENT_INTEREST_RATE', 'DPD', 'DUEDAY', 'EMI_DUEAMT',
       'EMI_OS_AMOUNT', 'EXCESS_ADJUSTED_AMT', 'FOIR', 'LAST_RECEIPT_AMOUNT',
       'LATEST_TRANSACTION_MONTH', 'NET_LTV', 'NET_RECEIVABLE',
       'NUM_EMI_CHANGES', 'NUM_LOW_FREQ_TRANSACTIONS', 'ORIGNAL_TENOR',
       'OUTSTANDING_PRINCIPAL', 'PRE_EMI_DUEAMT', 'Percentage_Completion',
       'Mean_Interest_Rate', 'MEAN_EMI_AMOUNT', 'PAID_AMOUNT', 'LAP', 'STHL',
       'STLAP'],
      dtype='object')
In [945]:
x_new=x.copy()
y_new=y.copy()
In [946]:
x_new['LAP']=df_2['LAP']
x_new['STHL']=df_2['STHL']
x_new['STLAP']=df_2['STLAP']
In [947]:
#x_new.drop(['LATEST_TRANSACTION_MONTH'],axis=1,inplace=True)
In [948]:
x_train_new,x_test_new,y_train_new,y_test_new=train_test_split(x_new,y_new,test_size=0.3,random_state=0)
In [949]:
#Validation Set
#x_test_1,x_test_final,y_test_1,y_test_final=train_test_split(x_test_new,y_test_new,test_size=0.3,random_state=0)
In [950]:
sm_new = SMOTE(random_state=0)
x_train_new,y_train_new=sm_new.fit_resample(x_train_new,y_train_new)
In [1059]:
x_train_new.head()
Out[1059]:
BALANCE_TENURE COMPLETED_TENURE CURRENT_INTEREST_RATE_CHANGES DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DPD DUEDAY EMI_DUEAMT EMI_OS_AMOUNT EXCESS_ADJUSTED_AMT FOIR LAST_RECEIPT_AMOUNT LATEST_TRANSACTION_MONTH NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PRE_EMI_DUEAMT Percentage_Completion Mean_Interest_Rate MEAN_EMI_AMOUNT PAID_AMOUNT LAP STHL STLAP
16187 0.018050 -0.705212 -0.676893 -0.622898 0.450196 -0.118447 -0.287603 -0.262438 -0.065941 -0.107284 -0.122503 -0.075827 0.463596 -1.732552 0.013175 -0.764159 -0.319120 -0.070397 -0.329322 -0.199913 -0.577410 1.737300 -0.133733 -0.260036 0 0 1
11072 0.127450 -0.523551 0.200694 -0.111365 -0.117511 -0.118447 -0.287603 -0.261256 -0.065941 -0.107284 -0.438120 -0.080292 -0.604188 -1.386642 0.035201 -0.382359 -0.705400 -0.070397 -0.345788 -0.173526 -0.476625 -0.237325 -0.142860 -0.258269 1 0 0
7393 0.705704 0.324202 -0.676893 -0.622898 0.450196 -0.118447 -0.287603 -0.231709 -0.065941 0.136986 -0.309004 -0.083243 0.463596 -0.038967 0.035201 0.381240 0.067159 1.272128 -0.373014 -0.198362 -0.131458 0.126204 -0.143385 -0.151162 0 1 0
6277 -0.091349 -0.281336 -0.676893 -0.622898 0.450196 -0.118447 -0.287603 -0.175345 -0.065941 -0.107284 0.207461 -0.059544 -0.604188 0.053780 0.035201 -0.382359 -0.319120 -0.070397 -0.192704 -0.181246 -0.298152 1.365509 -0.108466 -0.186443 0 0 1
7797 -0.216377 0.203095 -0.676893 -0.622898 0.450196 -0.118447 -0.287603 -0.161634 -0.065941 -0.107284 -0.265965 -0.069028 0.463596 1.138356 0.035201 -0.382359 -0.705400 -0.070397 -0.262827 -0.104943 0.020999 0.745857 -0.124190 -0.172700 0 0 1
In [1060]:
y_train_new.head()
Out[1060]:
16187    0
11072    0
7393     0
6277     0
7797     0
Name: FORECLOSURE, dtype: int64
In [1061]:
xgb4_new=xgb.XGBClassifier( learning_rate =0.1, n_estimators=200, max_depth=11,reg_alpha=1e-05,booster='gbtree',
 min_child_weight=44, gamma=0, subsample=0.8, colsample_bytree=0.6,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
xgb4_new=xgb4_new.fit(x_train_new, y_train_new)#44
In [1062]:
x_train_new.head()
Out[1062]:
BALANCE_TENURE COMPLETED_TENURE CURRENT_INTEREST_RATE_CHANGES DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DPD DUEDAY EMI_DUEAMT EMI_OS_AMOUNT EXCESS_ADJUSTED_AMT FOIR LAST_RECEIPT_AMOUNT LATEST_TRANSACTION_MONTH NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PRE_EMI_DUEAMT Percentage_Completion Mean_Interest_Rate MEAN_EMI_AMOUNT PAID_AMOUNT LAP STHL STLAP
16187 0.018050 -0.705212 -0.676893 -0.622898 0.450196 -0.118447 -0.287603 -0.262438 -0.065941 -0.107284 -0.122503 -0.075827 0.463596 -1.732552 0.013175 -0.764159 -0.319120 -0.070397 -0.329322 -0.199913 -0.577410 1.737300 -0.133733 -0.260036 0 0 1
11072 0.127450 -0.523551 0.200694 -0.111365 -0.117511 -0.118447 -0.287603 -0.261256 -0.065941 -0.107284 -0.438120 -0.080292 -0.604188 -1.386642 0.035201 -0.382359 -0.705400 -0.070397 -0.345788 -0.173526 -0.476625 -0.237325 -0.142860 -0.258269 1 0 0
7393 0.705704 0.324202 -0.676893 -0.622898 0.450196 -0.118447 -0.287603 -0.231709 -0.065941 0.136986 -0.309004 -0.083243 0.463596 -0.038967 0.035201 0.381240 0.067159 1.272128 -0.373014 -0.198362 -0.131458 0.126204 -0.143385 -0.151162 0 1 0
6277 -0.091349 -0.281336 -0.676893 -0.622898 0.450196 -0.118447 -0.287603 -0.175345 -0.065941 -0.107284 0.207461 -0.059544 -0.604188 0.053780 0.035201 -0.382359 -0.319120 -0.070397 -0.192704 -0.181246 -0.298152 1.365509 -0.108466 -0.186443 0 0 1
7797 -0.216377 0.203095 -0.676893 -0.622898 0.450196 -0.118447 -0.287603 -0.161634 -0.065941 -0.107284 -0.265965 -0.069028 0.463596 1.138356 0.035201 -0.382359 -0.705400 -0.070397 -0.262827 -0.104943 0.020999 0.745857 -0.124190 -0.172700 0 0 1
In [1069]:
class_report(y_train_new,xgb4_new.predict(x_train_new))
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     12386
           1       0.94      0.89      0.91      1257

    accuracy                           0.98     13643
   macro avg       0.96      0.94      0.95     13643
weighted avg       0.98      0.98      0.98     13643

In [1068]:
x_train_new.shape
Out[1068]:
(13643, 27)
In [1067]:
x_test_new.shape
Out[1067]:
(5847, 27)
In [1076]:
x_test_new=x_test_new.apply(zscore)
In [1077]:
x_test_new.head()
Out[1077]:
BALANCE_TENURE COMPLETED_TENURE CURRENT_INTEREST_RATE_CHANGES DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DPD DUEDAY EMI_DUEAMT EMI_OS_AMOUNT EXCESS_ADJUSTED_AMT FOIR LAST_RECEIPT_AMOUNT LATEST_TRANSACTION_MONTH NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PRE_EMI_DUEAMT Percentage_Completion Mean_Interest_Rate MEAN_EMI_AMOUNT PAID_AMOUNT LAP STHL STLAP
185 -0.041270 0.656487 -0.681889 -0.647378 -0.618729 -0.112538 3.477847 -0.003557 -0.042509 -0.082594 0.804154 -0.083840 -2.060534 -0.464719 0.036315 -1.149551 -0.710552 -0.062978 -0.140337 -0.076642 0.202298 0.883518 -0.092871 -0.016888 1.460630 -0.744186 -0.426531
17908 0.543314 -0.883067 0.190923 0.660599 -0.965817 -0.112538 -0.278983 -0.276386 -0.042509 -0.082594 -0.375634 -0.088186 0.449454 0.478696 0.036315 -0.385101 -1.114382 -0.062978 -0.102714 -0.104579 -0.702058 -0.529412 -0.131241 -0.036100 1.460630 -0.744186 -0.426531
51 1.048901 2.011294 -0.681889 -0.647378 -3.395430 -0.112538 3.477847 0.775097 -0.042509 -0.082594 0.404226 -0.133740 -1.701965 0.444198 0.036315 0.761573 0.097107 -0.154046 0.214405 -0.071781 0.427065 1.382199 -0.052690 0.037494 1.460630 -0.744186 -0.426531
15694 0.037728 -0.759903 -0.681889 -0.647378 0.422533 -0.112538 -0.278983 -0.267414 -0.042509 -0.082594 -0.515609 -0.098264 0.090884 -0.636728 0.036134 -0.385101 0.097107 -0.062978 -0.251852 -0.100492 -0.606079 1.980616 -0.143369 -0.035449 -0.684636 -0.744186 2.344497
17451 1.001502 -0.821485 -0.681889 -0.647378 0.422533 -0.112538 -0.278983 -0.292761 -0.042509 -0.082594 -0.215663 -0.118877 0.449454 0.040288 0.034677 -0.002876 -0.306723 1.303038 -0.341003 -0.113601 -0.685120 0.135496 -0.174778 -0.037297 -0.684636 1.343749 -0.426531
In [1078]:
class_report(y_test_new,xgb4_new.predict(x_test_new)) #88 94 91 98 without latest_Transaction
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      5363
           1       0.83      0.88      0.85       484

    accuracy                           0.97      5847
   macro avg       0.91      0.93      0.92      5847
weighted avg       0.98      0.97      0.98      5847

In [955]:
plt.figure(figsize=(10,5))
sns.barplot(x=100*(xgb4_new.feature_importances_),y=x_test_new.columns)
Out[955]:
<matplotlib.axes._subplots.AxesSubplot at 0xc24de17cf8>
In [973]:
x_new_smote=x_train_smote[['LATEST_TRANSACTION_MONTH','LAP','STHL','STLAP','EMI_DUEAMT','CURRENT_INTEREST_RATE_CHANGES'\
                        ,'DIFF_CURRENT_INTEREST_RATE_MAX_MIN','DIFF_ORIGINAL_CURRENT_INTEREST_RATE',\
                        'Mean_Interest_Rate','DPD','DUEDAY','ORIGNAL_TENOR','PAID_AMOUNT']] #'LATEST_TRANSACTION_MONTH',
In [ ]:
x_train_new_1=x_train_new[['LATEST_TRANSACTION_MONTH','LAP','STHL','STLAP','EMI_DUEAMT','CURRENT_INTEREST_RATE_CHANGES'\
                        ,'DIFF_CURRENT_INTEREST_RATE_MAX_MIN','DIFF_ORIGINAL_CURRENT_INTEREST_RATE',\
                        'Mean_Interest_Rate','DPD','DUEDAY','ORIGNAL_TENOR','PAID_AMOUNT']]
In [974]:
x_test_new_1=x_test_new[['LATEST_TRANSACTION_MONTH','LAP','STHL','STLAP','EMI_DUEAMT','CURRENT_INTEREST_RATE_CHANGES'\
                        ,'DIFF_CURRENT_INTEREST_RATE_MAX_MIN','DIFF_ORIGINAL_CURRENT_INTEREST_RATE',\
                        'Mean_Interest_Rate','DPD','DUEDAY','ORIGNAL_TENOR','PAID_AMOUNT']]#'LATEST_TRANSACTION_MONTH',
In [975]:
x_new_smote.head()
Out[975]:
LATEST_TRANSACTION_MONTH LAP STHL STLAP EMI_DUEAMT CURRENT_INTEREST_RATE_CHANGES DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE Mean_Interest_Rate DPD DUEDAY ORIGNAL_TENOR PAID_AMOUNT
0 0.459358 0 0 1 -0.267363 -0.67839 -0.630086 0.441731 1.742930 -0.115626 -0.285036 -0.068195 -0.048871
1 -0.610751 1 0 0 -0.266160 0.19775 -0.115144 -0.122161 -0.235195 -0.115626 -0.285036 -0.068195 -0.048588
2 0.459358 0 1 0 -0.236095 -0.67839 -0.630086 0.441731 0.128979 -0.115626 -0.285036 1.281239 -0.031454
3 -0.610751 0 0 1 -0.178741 -0.67839 -0.630086 0.441731 1.370480 -0.115626 -0.285036 -0.068195 -0.037098
4 0.459358 0 0 1 -0.164790 -0.67839 -0.630086 0.441731 0.749729 -0.115626 -0.285036 -0.068195 -0.034899
In [976]:
xgb4_new=xgb.XGBClassifier( learning_rate =0.1, n_estimators=200, max_depth=11,reg_alpha=1e-05,booster='gbtree',
 min_child_weight=44, gamma=0, subsample=0.8, colsample_bytree=0.6,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27)

xgb4_new=xgb4_new.fit(x_new_smote, y_train_new)
In [977]:
class_report(y_train_new,xgb4_new.predict(x_new_smote))
              precision    recall  f1-score   support

           0       0.99      0.98      0.99     12386
           1       0.98      0.99      0.99     12386

    accuracy                           0.99     24772
   macro avg       0.99      0.99      0.99     24772
weighted avg       0.99      0.99      0.99     24772

In [978]:
class_report(y_test_new,xgb4_new.predict(x_test_new_smote))  #83 95 89
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      5363
           1       0.85      0.94      0.89       484

    accuracy                           0.98      5847
   macro avg       0.92      0.96      0.94      5847
weighted avg       0.98      0.98      0.98      5847

In [979]:
confusionmatrix(y_train_new,xgb4_new.predict(x_new_smote))
In [980]:
confusionmatrix(y_test_new,xgb4_new.predict(x_test_new_smote))
In [981]:
plt.figure(figsize=(10,5))
sns.barplot(x=100*(xgb4_new.feature_importances_),y=x_test_new_smote.columns)
Out[981]:
<matplotlib.axes._subplots.AxesSubplot at 0xc2536e27f0>
In [982]:
print('Scores and Curve for Training data is ')
roc_plot(xgb4_new,x_new_smote,y_train_new)
Scores and Curve for Training data is 
roc_auc_score --> 0.9981599863938515
In [983]:
print('Scores and Curve for Training data is ')
roc_plot(xgb4_new,x_test_new_smote,y_test_new)
Scores and Curve for Training data is 
roc_auc_score --> 0.9924380473492233
In [1037]:
x_test.head(2)
Out[1037]:
BALANCE_TENURE COMPLETED_TENURE CURRENT_INTEREST_RATE_CHANGES DIFF_CURRENT_INTEREST_RATE_MAX_MIN DIFF_ORIGINAL_CURRENT_INTEREST_RATE DPD DUEDAY EMI_DUEAMT EMI_OS_AMOUNT EXCESS_ADJUSTED_AMT FOIR LAST_RECEIPT_AMOUNT LATEST_TRANSACTION_MONTH NET_LTV NET_RECEIVABLE NUM_EMI_CHANGES NUM_LOW_FREQ_TRANSACTIONS ORIGNAL_TENOR OUTSTANDING_PRINCIPAL PRE_EMI_DUEAMT Percentage_Completion Mean_Interest_Rate MEAN_EMI_AMOUNT PAID_AMOUNT
185 -0.043512 0.635680 -0.67839 -0.630086 -0.615566 -0.115626 3.000000 -0.002664 -0.049122 -0.094216 0.615953 -0.060418 -2.037562 -0.454491 0.035521 -1.147035 -0.706704 -0.068195 -0.132340 -0.089830 0.199790 0.873879 -0.074934 -0.013587
17908 0.536618 -0.885687 0.19775 0.657270 -0.967998 -0.115626 -0.285036 -0.263978 -0.049122 -0.094216 -0.299739 -0.063835 0.459358 0.480692 0.035521 -0.383181 -1.097979 -0.068195 -0.094511 -0.132344 -0.710517 -0.533155 -0.109256 -0.048345
In [ ]: